From 5bbe116f8970fdc3142ba3923dbce13e67b592a3 Mon Sep 17 00:00:00 2001 From: Dmitry Kurtaev Date: Thu, 27 Feb 2020 17:45:28 +0300 Subject: [PATCH 1/6] Track Hetero execution for nGraph networks using ngraph::Function --- modules/dnn/src/ie_ngraph.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/modules/dnn/src/ie_ngraph.cpp b/modules/dnn/src/ie_ngraph.cpp index e8cfd1265e..cdb0305dc9 100644 --- a/modules/dnn/src/ie_ngraph.cpp +++ b/modules/dnn/src/ie_ngraph.cpp @@ -442,13 +442,14 @@ void InfEngineNgraphNet::initPlugin(InferenceEngine::CNNNetwork& net) config.emplace("VPU_DETECT_NETWORK_BATCH", CONFIG_VALUE(NO)); } - bool isHetero = false; - if (device_name != "CPU") + bool isHetero = device_name == "FPGA"; + // It is actual only for non-CPU targets and networks built in runtime using nGraph. + // We do not check IR models because they can be with version less than IRv10 + if (!isHetero && device_name != "CPU" && !hasNetOwner) { - isHetero = device_name == "FPGA"; - for (auto& layer : net) + for (auto& node : net.getFunction()->get_ops()) { - if (layer->type == kOpenCVLayersType) + if (node->description() == kOpenCVLayersType) { isHetero = true; break; From edcc762f7a6b4f801acd7cbed9471dcc0afcb84d Mon Sep 17 00:00:00 2001 From: Yuriy Obukh Date: Fri, 28 Feb 2020 00:03:38 +0200 Subject: [PATCH 2/6] add "Copy to clipboard functional" to imshow wnd with Qt --- modules/highgui/src/window_QT.cpp | 35 ++++++++++++++++++++++++------- modules/highgui/src/window_QT.h | 3 +++ modules/highgui/src/window_QT.qrc | 1 + 3 files changed, 31 insertions(+), 8 deletions(-) diff --git a/modules/highgui/src/window_QT.cpp b/modules/highgui/src/window_QT.cpp index e83a442c6a..d77975f9ba 100644 --- a/modules/highgui/src/window_QT.cpp +++ b/modules/highgui/src/window_QT.cpp @@ -1856,7 +1856,7 @@ void CvWindow::displayStatusBar(QString text, int delayms) void CvWindow::enablePropertiesButton() { if (!vect_QActions.empty()) - vect_QActions[9]->setDisabled(false); + vect_QActions[10]->setDisabled(false); } @@ -1991,7 +1991,7 @@ void CvWindow::createView() void CvWindow::createActions() { - vect_QActions.resize(10); + vect_QActions.resize(11); QWidget* view = myView->getWidget(); @@ -2032,18 +2032,22 @@ void CvWindow::createActions() vect_QActions[8]->setIconVisibleInMenu(true); QObject::connect(vect_QActions[8], SIGNAL(triggered()), view, SLOT(saveView())); - vect_QActions[9] = new QAction(QIcon(":/properties-icon"), "Display properties window (CTRL+P)", this); + vect_QActions[9] = new QAction(QIcon(":/copy_clipbrd-icon"), "Copy image to clipboard (CTRL+C)", this); vect_QActions[9]->setIconVisibleInMenu(true); - QObject::connect(vect_QActions[9], SIGNAL(triggered()), this, SLOT(displayPropertiesWin())); + QObject::connect(vect_QActions[9], SIGNAL(triggered()), view, SLOT(copy2Clipbrd())); + + vect_QActions[10] = new QAction(QIcon(":/properties-icon"), "Display properties window (CTRL+P)", this); + vect_QActions[10]->setIconVisibleInMenu(true); + QObject::connect(vect_QActions[10], SIGNAL(triggered()), this, SLOT(displayPropertiesWin())); if (global_control_panel->myLayout->count() == 0) - vect_QActions[9]->setDisabled(true); + vect_QActions[10]->setDisabled(true); } void CvWindow::createShortcuts() { - vect_QShortcuts.resize(10); + vect_QShortcuts.resize(11); QWidget* view = myView->getWidget(); @@ -2074,8 +2078,11 @@ void CvWindow::createShortcuts() vect_QShortcuts[8] = new QShortcut(shortcut_save_img, this); QObject::connect(vect_QShortcuts[8], SIGNAL(activated()), view, 
SLOT(saveView())); - vect_QShortcuts[9] = new QShortcut(shortcut_properties_win, this); - QObject::connect(vect_QShortcuts[9], SIGNAL(activated()), this, SLOT(displayPropertiesWin())); + vect_QShortcuts[9] = new QShortcut(shortcut_copy_clipbrd, this); + QObject::connect(vect_QShortcuts[9], SIGNAL(activated()), view, SLOT(copy2Clipbrd())); + + vect_QShortcuts[10] = new QShortcut(shortcut_properties_win, this); + QObject::connect(vect_QShortcuts[10], SIGNAL(activated()), this, SLOT(displayPropertiesWin())); } @@ -2698,6 +2705,18 @@ void DefaultViewPort::saveView() } +//copy image to clipboard +void DefaultViewPort::copy2Clipbrd() +{ + // Create a new pixmap to render the viewport into + QPixmap viewportPixmap(viewport()->size()); + viewport()->render(&viewportPixmap); + + QClipboard *pClipboard = QApplication::clipboard(); + pClipboard->setPixmap(viewportPixmap); +} + + void DefaultViewPort::contextMenuEvent(QContextMenuEvent* evnt) { if (centralWidget->vect_QActions.size() > 0) diff --git a/modules/highgui/src/window_QT.h b/modules/highgui/src/window_QT.h index b132f41ede..dbeacf2edf 100644 --- a/modules/highgui/src/window_QT.h +++ b/modules/highgui/src/window_QT.h @@ -76,6 +76,7 @@ #include #include #include +#include #include #include @@ -91,6 +92,7 @@ enum { CV_MODE_NORMAL = 0, CV_MODE_OPENGL = 1 }; enum { shortcut_zoom_normal = Qt::CTRL + Qt::Key_Z, shortcut_zoom_imgRegion = Qt::CTRL + Qt::Key_X, shortcut_save_img = Qt::CTRL + Qt::Key_S, + shortcut_copy_clipbrd = Qt::CTRL + Qt::Key_C, shortcut_properties_win = Qt::CTRL + Qt::Key_P, shortcut_zoom_in = Qt::CTRL + Qt::Key_Plus,//QKeySequence(QKeySequence::ZoomIn), shortcut_zoom_out = Qt::CTRL + Qt::Key_Minus,//QKeySequence(QKeySequence::ZoomOut), @@ -518,6 +520,7 @@ public slots: void ZoomOut(); void saveView(); + void copy2Clipbrd(); protected: void contextMenuEvent(QContextMenuEvent* event) CV_OVERRIDE; diff --git a/modules/highgui/src/window_QT.qrc b/modules/highgui/src/window_QT.qrc index 7bcdc24541..efdd8c29a9 100644 --- a/modules/highgui/src/window_QT.qrc +++ b/modules/highgui/src/window_QT.qrc @@ -9,6 +9,7 @@ files_Qt/Milky/48/106.png files_Qt/Milky/48/107.png files_Qt/Milky/48/7.png + files_Qt/Milky/48/43.png files_Qt/Milky/48/38.png files_Qt/stylesheet_trackbar.qss From 1f48940a9328cda62bb0386468e9a2f7e11e36a5 Mon Sep 17 00:00:00 2001 From: jshiwam Date: Thu, 20 Feb 2020 16:29:17 +0530 Subject: [PATCH 3/6] bug fix to Inconsistent comment for EXIF issue id #15706 stream added final removed all the changes occured while trying to resolve conflicts --- modules/imgcodecs/src/exif.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/modules/imgcodecs/src/exif.hpp b/modules/imgcodecs/src/exif.hpp index f7f4e383e9..dc9a58ab0b 100644 --- a/modules/imgcodecs/src/exif.hpp +++ b/modules/imgcodecs/src/exif.hpp @@ -154,7 +154,8 @@ enum ImageOrientation * Usage example for getting the orientation of the image: * * @code - * ExifReader reader(fileName); + * std::ifstream stream(filename,std::ios_base::in | std::ios_base::binary); + * ExifReader reader(stream); * if( reader.parse() ) * { * int orientation = reader.getTag(Orientation).field_u16; From 8c24af66bd18b6cadce602ac392b06a72fe229c1 Mon Sep 17 00:00:00 2001 From: Chip Kerchner <49959681+ChipKerchner@users.noreply.github.com> Date: Fri, 28 Feb 2020 11:34:06 -0500 Subject: [PATCH 4/6] Merge pull request #16556 from ChipKerchner:vectorizeIntegralSumPixels * Vectorize calculating integral for line for single and multiple channels * Single vector processing for 4-channels - 
25-30% faster * Single vector processing for 4-channels - 25-30% faster * Fixed AVX512 code for 4 channels * Disable 3 channel 8UC1 to 32S for SSE2 and SSE3 (slower). Use new version of 8UC1 to 64F for AVX512. --- modules/imgproc/src/sumpixels.simd.hpp | 1057 +++++++++++++++++++++--- 1 file changed, 958 insertions(+), 99 deletions(-) diff --git a/modules/imgproc/src/sumpixels.simd.hpp b/modules/imgproc/src/sumpixels.simd.hpp index c8d60a0040..2ac02a0c3c 100644 --- a/modules/imgproc/src/sumpixels.simd.hpp +++ b/modules/imgproc/src/sumpixels.simd.hpp @@ -75,32 +75,6 @@ struct Integral_SIMD } }; -#if CV_AVX512_SKX -template <> -struct Integral_SIMD { - Integral_SIMD() {}; - - - bool operator()(const uchar *src, size_t _srcstep, - double *sum, size_t _sumstep, - double *sqsum, size_t _sqsumstep, - double *tilted, size_t _tiltedstep, - int width, int height, int cn) const - { - CV_UNUSED(_tiltedstep); - // TODO: Add support for 1 channel input (WIP) - if (!tilted && (cn <= 4)) - { - calculate_integral_avx512(src, _srcstep, sum, _sumstep, - sqsum, _sqsumstep, width, height, cn); - return true; - } - return false; - } - -}; -#endif - #if CV_SIMD && CV_SIMD_WIDTH <= 64 template <> @@ -114,57 +88,304 @@ struct Integral_SIMD int * tilted, size_t, int width, int height, int cn) const { - if (sqsum || tilted || cn != 1) + if (sqsum || tilted || cn > 4) return false; +#if !CV_SSE4_1 && CV_SSE2 + // 3 channel code is slower for SSE2 & SSE3 + if (cn == 3) + return false; +#endif + + width *= cn; // the first iteration - memset(sum, 0, (width + 1) * sizeof(int)); + memset(sum, 0, (width + cn) * sizeof(int)); - // the others - for (int i = 0; i < height; ++i) + if (cn == 1) { - const uchar * src_row = src + _srcstep * i; - int * prev_sum_row = (int *)((uchar *)sum + _sumstep * i) + 1; - int * sum_row = (int *)((uchar *)sum + _sumstep * (i + 1)) + 1; - - sum_row[-1] = 0; - - v_int32 prev = vx_setzero_s32(); - int j = 0; - for ( ; j + v_uint16::nlanes <= width; j += v_uint16::nlanes) + // the others + for (int i = 0; i < height; ++i) { - v_int16 el8 = v_reinterpret_as_s16(vx_load_expand(src_row + j)); - v_int32 el4l, el4h; + const uchar * src_row = src + _srcstep * i; + int * prev_sum_row = (int *)((uchar *)sum + _sumstep * i) + 1; + int * sum_row = (int *)((uchar *)sum + _sumstep * (i + 1)) + 1; + + sum_row[-1] = 0; + + v_int32 prev = vx_setzero_s32(); + int j = 0; + for ( ; j + v_uint16::nlanes <= width; j += v_uint16::nlanes) + { + v_int16 el8 = v_reinterpret_as_s16(vx_load_expand(src_row + j)); + v_int32 el4l, el4h; #if CV_AVX2 && CV_SIMD_WIDTH == 32 - __m256i vsum = _mm256_add_epi16(el8.val, _mm256_slli_si256(el8.val, 2)); - vsum = _mm256_add_epi16(vsum, _mm256_slli_si256(vsum, 4)); - vsum = _mm256_add_epi16(vsum, _mm256_slli_si256(vsum, 8)); - __m256i shmask = _mm256_set1_epi32(7); - el4l.val = _mm256_add_epi32(_mm256_cvtepi16_epi32(_v256_extract_low(vsum)), prev.val); - el4h.val = _mm256_add_epi32(_mm256_cvtepi16_epi32(_v256_extract_high(vsum)), _mm256_permutevar8x32_epi32(el4l.val, shmask)); - prev.val = _mm256_permutevar8x32_epi32(el4h.val, shmask); + __m256i vsum = _mm256_add_epi16(el8.val, _mm256_slli_si256(el8.val, 2)); + vsum = _mm256_add_epi16(vsum, _mm256_slli_si256(vsum, 4)); + vsum = _mm256_add_epi16(vsum, _mm256_slli_si256(vsum, 8)); + __m256i shmask = _mm256_set1_epi32(7); + el4l.val = _mm256_add_epi32(_mm256_cvtepi16_epi32(_v256_extract_low(vsum)), prev.val); + el4h.val = _mm256_add_epi32(_mm256_cvtepi16_epi32(_v256_extract_high(vsum)), _mm256_permutevar8x32_epi32(el4l.val, shmask)); + 
prev.val = _mm256_permutevar8x32_epi32(el4h.val, shmask); #else - el8 += v_rotate_left<1>(el8); - el8 += v_rotate_left<2>(el8); + el8 += v_rotate_left<1>(el8); + el8 += v_rotate_left<2>(el8); #if CV_SIMD_WIDTH >= 32 - el8 += v_rotate_left<4>(el8); + el8 += v_rotate_left<4>(el8); #if CV_SIMD_WIDTH == 64 - el8 += v_rotate_left<8>(el8); + el8 += v_rotate_left<8>(el8); #endif #endif - v_expand(el8, el4l, el4h); - el4l += prev; - el4h += el4l; + v_expand(el8, el4l, el4h); + el4l += prev; + el4h += el4l; + prev = v_broadcast_element(el4h); +#endif + v_store(sum_row + j , el4l + vx_load(prev_sum_row + j )); + v_store(sum_row + j + v_int32::nlanes, el4h + vx_load(prev_sum_row + j + v_int32::nlanes)); + } - prev = v_broadcast_element(el4h); -#endif - v_store(sum_row + j , el4l + vx_load(prev_sum_row + j )); - v_store(sum_row + j + v_int32::nlanes, el4h + vx_load(prev_sum_row + j + v_int32::nlanes)); + for (int v = sum_row[j - 1] - prev_sum_row[j - 1]; j < width; ++j) + sum_row[j] = (v += src_row[j]) + prev_sum_row[j]; } - - for (int v = sum_row[j - 1] - prev_sum_row[j - 1]; j < width; ++j) - sum_row[j] = (v += src_row[j]) + prev_sum_row[j]; } + else if (cn == 2) + { + // the others + v_int16 mask = vx_setall_s16((short)0xff); + for (int i = 0; i < height; ++i) + { + const uchar * src_row = src + _srcstep * i; + int * prev_sum_row = (int *)((uchar *)sum + _sumstep * i) + cn; + int * sum_row = (int *)((uchar *)sum + _sumstep * (i + 1)) + cn; + + sum_row[-1] = sum_row[-2] = 0; + + v_int32 prev_1 = vx_setzero_s32(), prev_2 = vx_setzero_s32(); + int j = 0; + for ( ; j + v_uint16::nlanes * cn <= width; j += v_uint16::nlanes * cn) + { + v_int16 v_src_row = v_reinterpret_as_s16(vx_load(src_row + j)); + v_int16 el8_1 = v_src_row & mask; + v_int16 el8_2 = v_reinterpret_as_s16(v_reinterpret_as_u16(v_src_row) >> 8); + v_int32 el4l_1, el4h_1, el4l_2, el4h_2; +#if CV_AVX2 && CV_SIMD_WIDTH == 32 + __m256i vsum_1 = _mm256_add_epi16(el8_1.val, _mm256_slli_si256(el8_1.val, 2)); + __m256i vsum_2 = _mm256_add_epi16(el8_2.val, _mm256_slli_si256(el8_2.val, 2)); + vsum_1 = _mm256_add_epi16(vsum_1, _mm256_slli_si256(vsum_1, 4)); + vsum_2 = _mm256_add_epi16(vsum_2, _mm256_slli_si256(vsum_2, 4)); + vsum_1 = _mm256_add_epi16(vsum_1, _mm256_slli_si256(vsum_1, 8)); + vsum_2 = _mm256_add_epi16(vsum_2, _mm256_slli_si256(vsum_2, 8)); + __m256i shmask = _mm256_set1_epi32(7); + el4l_1.val = _mm256_add_epi32(_mm256_cvtepi16_epi32(_v256_extract_low(vsum_1)), prev_1.val); + el4l_2.val = _mm256_add_epi32(_mm256_cvtepi16_epi32(_v256_extract_low(vsum_2)), prev_2.val); + el4h_1.val = _mm256_add_epi32(_mm256_cvtepi16_epi32(_v256_extract_high(vsum_1)), _mm256_permutevar8x32_epi32(el4l_1.val, shmask)); + el4h_2.val = _mm256_add_epi32(_mm256_cvtepi16_epi32(_v256_extract_high(vsum_2)), _mm256_permutevar8x32_epi32(el4l_2.val, shmask)); + prev_1.val = _mm256_permutevar8x32_epi32(el4h_1.val, shmask); + prev_2.val = _mm256_permutevar8x32_epi32(el4h_2.val, shmask); +#else + el8_1 += v_rotate_left<1>(el8_1); + el8_2 += v_rotate_left<1>(el8_2); + el8_1 += v_rotate_left<2>(el8_1); + el8_2 += v_rotate_left<2>(el8_2); +#if CV_SIMD_WIDTH >= 32 + el8_1 += v_rotate_left<4>(el8_1); + el8_2 += v_rotate_left<4>(el8_2); +#if CV_SIMD_WIDTH == 64 + el8_1 += v_rotate_left<8>(el8_1); + el8_2 += v_rotate_left<8>(el8_2); +#endif +#endif + v_expand(el8_1, el4l_1, el4h_1); + v_expand(el8_2, el4l_2, el4h_2); + el4l_1 += prev_1; + el4l_2 += prev_2; + el4h_1 += el4l_1; + el4h_2 += el4l_2; + prev_1 = v_broadcast_element(el4h_1); + prev_2 = v_broadcast_element(el4h_2); 
+#endif + v_int32 el4_1, el4_2, el4_3, el4_4; + v_zip(el4l_1, el4l_2, el4_1, el4_2); + v_zip(el4h_1, el4h_2, el4_3, el4_4); + v_store(sum_row + j , el4_1 + vx_load(prev_sum_row + j )); + v_store(sum_row + j + v_int32::nlanes , el4_2 + vx_load(prev_sum_row + j + v_int32::nlanes )); + v_store(sum_row + j + v_int32::nlanes * 2, el4_3 + vx_load(prev_sum_row + j + v_int32::nlanes * 2)); + v_store(sum_row + j + v_int32::nlanes * 3, el4_4 + vx_load(prev_sum_row + j + v_int32::nlanes * 3)); + } + + for (int v2 = sum_row[j - 1] - prev_sum_row[j - 1], + v1 = sum_row[j - 2] - prev_sum_row[j - 2]; j < width; j += 2) + { + sum_row[j] = (v1 += src_row[j]) + prev_sum_row[j]; + sum_row[j + 1] = (v2 += src_row[j + 1]) + prev_sum_row[j + 1]; + } + } + } +#if CV_SSE4_1 || !CV_SSE2 + else if (cn == 3) + { + // the others + for (int i = 0; i < height; ++i) + { + const uchar * src_row = src + _srcstep * i; + int * prev_sum_row = (int *)((uchar *)sum + _sumstep * i) + cn; + int * sum_row = (int *)((uchar *)sum + _sumstep * (i + 1)) + cn; + int row_cache[v_int32::nlanes * 6]; + + sum_row[-1] = sum_row[-2] = sum_row[-3] = 0; + + v_int32 prev_1 = vx_setzero_s32(), prev_2 = vx_setzero_s32(), + prev_3 = vx_setzero_s32(); + int j = 0; + for ( ; j + v_uint16::nlanes * cn <= width; j += v_uint16::nlanes * cn) + { + v_uint8 v_src_row_1, v_src_row_2, v_src_row_3; + v_load_deinterleave(src_row + j, v_src_row_1, v_src_row_2, v_src_row_3); + v_int16 el8_1 = v_reinterpret_as_s16(v_expand_low(v_src_row_1)); + v_int16 el8_2 = v_reinterpret_as_s16(v_expand_low(v_src_row_2)); + v_int16 el8_3 = v_reinterpret_as_s16(v_expand_low(v_src_row_3)); + v_int32 el4l_1, el4h_1, el4l_2, el4h_2, el4l_3, el4h_3; +#if CV_AVX2 && CV_SIMD_WIDTH == 32 + __m256i vsum_1 = _mm256_add_epi16(el8_1.val, _mm256_slli_si256(el8_1.val, 2)); + __m256i vsum_2 = _mm256_add_epi16(el8_2.val, _mm256_slli_si256(el8_2.val, 2)); + __m256i vsum_3 = _mm256_add_epi16(el8_3.val, _mm256_slli_si256(el8_3.val, 2)); + vsum_1 = _mm256_add_epi16(vsum_1, _mm256_slli_si256(vsum_1, 4)); + vsum_2 = _mm256_add_epi16(vsum_2, _mm256_slli_si256(vsum_2, 4)); + vsum_3 = _mm256_add_epi16(vsum_3, _mm256_slli_si256(vsum_3, 4)); + vsum_1 = _mm256_add_epi16(vsum_1, _mm256_slli_si256(vsum_1, 8)); + vsum_2 = _mm256_add_epi16(vsum_2, _mm256_slli_si256(vsum_2, 8)); + vsum_3 = _mm256_add_epi16(vsum_3, _mm256_slli_si256(vsum_3, 8)); + __m256i shmask = _mm256_set1_epi32(7); + el4l_1.val = _mm256_add_epi32(_mm256_cvtepi16_epi32(_v256_extract_low(vsum_1)), prev_1.val); + el4l_2.val = _mm256_add_epi32(_mm256_cvtepi16_epi32(_v256_extract_low(vsum_2)), prev_2.val); + el4l_3.val = _mm256_add_epi32(_mm256_cvtepi16_epi32(_v256_extract_low(vsum_3)), prev_3.val); + el4h_1.val = _mm256_add_epi32(_mm256_cvtepi16_epi32(_v256_extract_high(vsum_1)), _mm256_permutevar8x32_epi32(el4l_1.val, shmask)); + el4h_2.val = _mm256_add_epi32(_mm256_cvtepi16_epi32(_v256_extract_high(vsum_2)), _mm256_permutevar8x32_epi32(el4l_2.val, shmask)); + el4h_3.val = _mm256_add_epi32(_mm256_cvtepi16_epi32(_v256_extract_high(vsum_3)), _mm256_permutevar8x32_epi32(el4l_3.val, shmask)); + prev_1.val = _mm256_permutevar8x32_epi32(el4h_1.val, shmask); + prev_2.val = _mm256_permutevar8x32_epi32(el4h_2.val, shmask); + prev_3.val = _mm256_permutevar8x32_epi32(el4h_3.val, shmask); +#else + el8_1 += v_rotate_left<1>(el8_1); + el8_2 += v_rotate_left<1>(el8_2); + el8_3 += v_rotate_left<1>(el8_3); + el8_1 += v_rotate_left<2>(el8_1); + el8_2 += v_rotate_left<2>(el8_2); + el8_3 += v_rotate_left<2>(el8_3); +#if CV_SIMD_WIDTH >= 32 + el8_1 += 
v_rotate_left<4>(el8_1); + el8_2 += v_rotate_left<4>(el8_2); + el8_3 += v_rotate_left<4>(el8_3); +#if CV_SIMD_WIDTH == 64 + el8_1 += v_rotate_left<8>(el8_1); + el8_2 += v_rotate_left<8>(el8_2); + el8_3 += v_rotate_left<8>(el8_3); +#endif +#endif + v_expand(el8_1, el4l_1, el4h_1); + v_expand(el8_2, el4l_2, el4h_2); + v_expand(el8_3, el4l_3, el4h_3); + el4l_1 += prev_1; + el4l_2 += prev_2; + el4l_3 += prev_3; + el4h_1 += el4l_1; + el4h_2 += el4l_2; + el4h_3 += el4l_3; + prev_1 = v_broadcast_element(el4h_1); + prev_2 = v_broadcast_element(el4h_2); + prev_3 = v_broadcast_element(el4h_3); +#endif + v_store_interleave(row_cache , el4l_1, el4l_2, el4l_3); + v_store_interleave(row_cache + v_int32::nlanes * 3, el4h_1, el4h_2, el4h_3); + el4l_1 = vx_load(row_cache ); + el4l_2 = vx_load(row_cache + v_int32::nlanes ); + el4l_3 = vx_load(row_cache + v_int32::nlanes * 2); + el4h_1 = vx_load(row_cache + v_int32::nlanes * 3); + el4h_2 = vx_load(row_cache + v_int32::nlanes * 4); + el4h_3 = vx_load(row_cache + v_int32::nlanes * 5); + v_store(sum_row + j , el4l_1 + vx_load(prev_sum_row + j )); + v_store(sum_row + j + v_int32::nlanes , el4l_2 + vx_load(prev_sum_row + j + v_int32::nlanes )); + v_store(sum_row + j + v_int32::nlanes * 2, el4l_3 + vx_load(prev_sum_row + j + v_int32::nlanes * 2)); + v_store(sum_row + j + v_int32::nlanes * 3, el4h_1 + vx_load(prev_sum_row + j + v_int32::nlanes * 3)); + v_store(sum_row + j + v_int32::nlanes * 4, el4h_2 + vx_load(prev_sum_row + j + v_int32::nlanes * 4)); + v_store(sum_row + j + v_int32::nlanes * 5, el4h_3 + vx_load(prev_sum_row + j + v_int32::nlanes * 5)); + } + + for (int v3 = sum_row[j - 1] - prev_sum_row[j - 1], + v2 = sum_row[j - 2] - prev_sum_row[j - 2], + v1 = sum_row[j - 3] - prev_sum_row[j - 3]; j < width; j += 3) + { + sum_row[j] = (v1 += src_row[j]) + prev_sum_row[j]; + sum_row[j + 1] = (v2 += src_row[j + 1]) + prev_sum_row[j + 1]; + sum_row[j + 2] = (v3 += src_row[j + 2]) + prev_sum_row[j + 2]; + } + } + } +#endif + else if (cn == 4) + { + // the others + for (int i = 0; i < height; ++i) + { + const uchar * src_row = src + _srcstep * i; + int * prev_sum_row = (int *)((uchar *)sum + _sumstep * i) + cn; + int * sum_row = (int *)((uchar *)sum + _sumstep * (i + 1)) + cn; + + sum_row[-1] = sum_row[-2] = sum_row[-3] = sum_row[-4] = 0; + + v_int32 prev = vx_setzero_s32(); + int j = 0; + for ( ; j + v_uint16::nlanes <= width; j += v_uint16::nlanes) + { + v_int16 el8 = v_reinterpret_as_s16(vx_load_expand(src_row + j)); + v_int32 el4l, el4h; +#if CV_AVX2 && CV_SIMD_WIDTH == 32 + __m256i vsum = _mm256_add_epi16(el8.val, _mm256_slli_si256(el8.val, 8)); + el4l.val = _mm256_add_epi32(_mm256_cvtepi16_epi32(_v256_extract_low(vsum)), prev.val); + el4h.val = _mm256_add_epi32(_mm256_cvtepi16_epi32(_v256_extract_high(vsum)), _mm256_permute2x128_si256(el4l.val, el4l.val, 0x31)); + prev.val = _mm256_permute2x128_si256(el4h.val, el4h.val, 0x31); +#else +#if CV_SIMD_WIDTH >= 32 + el8 += v_rotate_left<4>(el8); +#if CV_SIMD_WIDTH == 64 + el8 += v_rotate_left<8>(el8); +#endif +#endif + v_expand(el8, el4l, el4h); + el4l += prev; + el4h += el4l; +#if CV_SIMD_WIDTH == 16 + prev = el4h; +#elif CV_SIMD_WIDTH == 32 + prev = v_combine_high(el4h, el4h); +#else + v_int32 t = v_rotate_right<12>(el4h); + t |= v_rotate_left<4>(t); + prev = v_combine_low(t, t); +#endif +#endif + v_store(sum_row + j , el4l + vx_load(prev_sum_row + j )); + v_store(sum_row + j + v_int32::nlanes, el4h + vx_load(prev_sum_row + j + v_int32::nlanes)); + } + + for (int v4 = sum_row[j - 1] - prev_sum_row[j - 1], + v3 = 
sum_row[j - 2] - prev_sum_row[j - 2], + v2 = sum_row[j - 3] - prev_sum_row[j - 3], + v1 = sum_row[j - 4] - prev_sum_row[j - 4]; j < width; j += 4) + { + sum_row[j] = (v1 += src_row[j]) + prev_sum_row[j]; + sum_row[j + 1] = (v2 += src_row[j + 1]) + prev_sum_row[j + 1]; + sum_row[j + 2] = (v3 += src_row[j + 2]) + prev_sum_row[j + 2]; + sum_row[j + 3] = (v4 += src_row[j + 3]) + prev_sum_row[j + 3]; + } + } + } + else + { + return false; + } + vx_cleanup(); + return true; } }; @@ -180,62 +401,700 @@ struct Integral_SIMD float * tilted, size_t, int width, int height, int cn) const { - if (sqsum || tilted || cn != 1) + if (sqsum || tilted || cn > 4) return false; + width *= cn; + // the first iteration - memset(sum, 0, (width + 1) * sizeof(int)); + memset(sum, 0, (width + cn) * sizeof(float)); - // the others - for (int i = 0; i < height; ++i) + if (cn == 1) { - const uchar * src_row = src + _srcstep * i; - float * prev_sum_row = (float *)((uchar *)sum + _sumstep * i) + 1; - float * sum_row = (float *)((uchar *)sum + _sumstep * (i + 1)) + 1; - - sum_row[-1] = 0; - - v_float32 prev = vx_setzero_f32(); - int j = 0; - for (; j + v_uint16::nlanes <= width; j += v_uint16::nlanes) + // the others + for (int i = 0; i < height; ++i) { - v_int16 el8 = v_reinterpret_as_s16(vx_load_expand(src_row + j)); - v_float32 el4l, el4h; + const uchar * src_row = src + _srcstep * i; + float * prev_sum_row = (float *)((uchar *)sum + _sumstep * i) + 1; + float * sum_row = (float *)((uchar *)sum + _sumstep * (i + 1)) + 1; + + sum_row[-1] = 0; + + v_float32 prev = vx_setzero_f32(); + int j = 0; + for (; j + v_uint16::nlanes <= width; j += v_uint16::nlanes) + { + v_int16 el8 = v_reinterpret_as_s16(vx_load_expand(src_row + j)); + v_float32 el4l, el4h; #if CV_AVX2 && CV_SIMD_WIDTH == 32 - __m256i vsum = _mm256_add_epi16(el8.val, _mm256_slli_si256(el8.val, 2)); - vsum = _mm256_add_epi16(vsum, _mm256_slli_si256(vsum, 4)); - vsum = _mm256_add_epi16(vsum, _mm256_slli_si256(vsum, 8)); - __m256i shmask = _mm256_set1_epi32(7); - el4l.val = _mm256_add_ps(_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_v256_extract_low(vsum))), prev.val); - el4h.val = _mm256_add_ps(_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_v256_extract_high(vsum))), _mm256_permutevar8x32_ps(el4l.val, shmask)); - prev.val = _mm256_permutevar8x32_ps(el4h.val, shmask); + __m256i vsum = _mm256_add_epi16(el8.val, _mm256_slli_si256(el8.val, 2)); + vsum = _mm256_add_epi16(vsum, _mm256_slli_si256(vsum, 4)); + vsum = _mm256_add_epi16(vsum, _mm256_slli_si256(vsum, 8)); + __m256i shmask = _mm256_set1_epi32(7); + el4l.val = _mm256_add_ps(_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_v256_extract_low(vsum))), prev.val); + el4h.val = _mm256_add_ps(_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_v256_extract_high(vsum))), _mm256_permutevar8x32_ps(el4l.val, shmask)); + prev.val = _mm256_permutevar8x32_ps(el4h.val, shmask); #else - el8 += v_rotate_left<1>(el8); - el8 += v_rotate_left<2>(el8); + el8 += v_rotate_left<1>(el8); + el8 += v_rotate_left<2>(el8); #if CV_SIMD_WIDTH >= 32 - el8 += v_rotate_left<4>(el8); + el8 += v_rotate_left<4>(el8); #if CV_SIMD_WIDTH == 64 - el8 += v_rotate_left<8>(el8); + el8 += v_rotate_left<8>(el8); #endif #endif - v_int32 el4li, el4hi; - v_expand(el8, el4li, el4hi); - el4l = v_cvt_f32(el4li) + prev; - el4h = v_cvt_f32(el4hi) + el4l; + v_int32 el4li, el4hi; + v_expand(el8, el4li, el4hi); + el4l = v_cvt_f32(el4li) + prev; + el4h = v_cvt_f32(el4hi) + el4l; + prev = v_broadcast_element(el4h); +#endif + v_store(sum_row + j , el4l + vx_load(prev_sum_row + j )); + 
v_store(sum_row + j + v_float32::nlanes, el4h + vx_load(prev_sum_row + j + v_float32::nlanes)); + } - prev = v_broadcast_element(el4h); -#endif - v_store(sum_row + j , el4l + vx_load(prev_sum_row + j )); - v_store(sum_row + j + v_float32::nlanes, el4h + vx_load(prev_sum_row + j + v_float32::nlanes)); + for (float v = sum_row[j - 1] - prev_sum_row[j - 1]; j < width; ++j) + sum_row[j] = (v += src_row[j]) + prev_sum_row[j]; } - - for (float v = sum_row[j - 1] - prev_sum_row[j - 1]; j < width; ++j) - sum_row[j] = (v += src_row[j]) + prev_sum_row[j]; } + else if (cn == 2) + { + // the others + v_int16 mask = vx_setall_s16((short)0xff); + for (int i = 0; i < height; ++i) + { + const uchar * src_row = src + _srcstep * i; + float * prev_sum_row = (float *)((uchar *)sum + _sumstep * i) + cn; + float * sum_row = (float *)((uchar *)sum + _sumstep * (i + 1)) + cn; + + sum_row[-1] = sum_row[-2] = 0; + + v_float32 prev_1 = vx_setzero_f32(), prev_2 = vx_setzero_f32(); + int j = 0; + for (; j + v_uint16::nlanes * cn <= width; j += v_uint16::nlanes * cn) + { + v_int16 v_src_row = v_reinterpret_as_s16(vx_load(src_row + j)); + v_int16 el8_1 = v_src_row & mask; + v_int16 el8_2 = v_reinterpret_as_s16(v_reinterpret_as_u16(v_src_row) >> 8); + v_float32 el4l_1, el4h_1, el4l_2, el4h_2; +#if CV_AVX2 && CV_SIMD_WIDTH == 32 + __m256i vsum_1 = _mm256_add_epi16(el8_1.val, _mm256_slli_si256(el8_1.val, 2)); + __m256i vsum_2 = _mm256_add_epi16(el8_2.val, _mm256_slli_si256(el8_2.val, 2)); + vsum_1 = _mm256_add_epi16(vsum_1, _mm256_slli_si256(vsum_1, 4)); + vsum_2 = _mm256_add_epi16(vsum_2, _mm256_slli_si256(vsum_2, 4)); + vsum_1 = _mm256_add_epi16(vsum_1, _mm256_slli_si256(vsum_1, 8)); + vsum_2 = _mm256_add_epi16(vsum_2, _mm256_slli_si256(vsum_2, 8)); + __m256i shmask = _mm256_set1_epi32(7); + el4l_1.val = _mm256_add_ps(_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_v256_extract_low(vsum_1))), prev_1.val); + el4l_2.val = _mm256_add_ps(_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_v256_extract_low(vsum_2))), prev_2.val); + el4h_1.val = _mm256_add_ps(_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_v256_extract_high(vsum_1))), _mm256_permutevar8x32_ps(el4l_1.val, shmask)); + el4h_2.val = _mm256_add_ps(_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_v256_extract_high(vsum_2))), _mm256_permutevar8x32_ps(el4l_2.val, shmask)); + prev_1.val = _mm256_permutevar8x32_ps(el4h_1.val, shmask); + prev_2.val = _mm256_permutevar8x32_ps(el4h_2.val, shmask); +#else + el8_1 += v_rotate_left<1>(el8_1); + el8_2 += v_rotate_left<1>(el8_2); + el8_1 += v_rotate_left<2>(el8_1); + el8_2 += v_rotate_left<2>(el8_2); +#if CV_SIMD_WIDTH >= 32 + el8_1 += v_rotate_left<4>(el8_1); + el8_2 += v_rotate_left<4>(el8_2); +#if CV_SIMD_WIDTH == 64 + el8_1 += v_rotate_left<8>(el8_1); + el8_2 += v_rotate_left<8>(el8_2); +#endif +#endif + v_int32 el4li_1, el4hi_1, el4li_2, el4hi_2; + v_expand(el8_1, el4li_1, el4hi_1); + v_expand(el8_2, el4li_2, el4hi_2); + el4l_1 = v_cvt_f32(el4li_1) + prev_1; + el4l_2 = v_cvt_f32(el4li_2) + prev_2; + el4h_1 = v_cvt_f32(el4hi_1) + el4l_1; + el4h_2 = v_cvt_f32(el4hi_2) + el4l_2; + prev_1 = v_broadcast_element(el4h_1); + prev_2 = v_broadcast_element(el4h_2); +#endif + v_float32 el4_1, el4_2, el4_3, el4_4; + v_zip(el4l_1, el4l_2, el4_1, el4_2); + v_zip(el4h_1, el4h_2, el4_3, el4_4); + v_store(sum_row + j , el4_1 + vx_load(prev_sum_row + j )); + v_store(sum_row + j + v_float32::nlanes , el4_2 + vx_load(prev_sum_row + j + v_float32::nlanes )); + v_store(sum_row + j + v_float32::nlanes * 2, el4_3 + vx_load(prev_sum_row + j + v_float32::nlanes * 2)); + 
v_store(sum_row + j + v_float32::nlanes * 3, el4_4 + vx_load(prev_sum_row + j + v_float32::nlanes * 3)); + } + + for (float v2 = sum_row[j - 1] - prev_sum_row[j - 1], + v1 = sum_row[j - 2] - prev_sum_row[j - 2]; j < width; j += 2) + { + sum_row[j] = (v1 += src_row[j]) + prev_sum_row[j]; + sum_row[j + 1] = (v2 += src_row[j + 1]) + prev_sum_row[j + 1]; + } + } + } + else if (cn == 3) + { + // the others + for (int i = 0; i < height; ++i) + { + const uchar * src_row = src + _srcstep * i; + float * prev_sum_row = (float *)((uchar *)sum + _sumstep * i) + cn; + float * sum_row = (float *)((uchar *)sum + _sumstep * (i + 1)) + cn; + float row_cache[v_float32::nlanes * 6]; + + sum_row[-1] = sum_row[-2] = sum_row[-3] = 0; + + v_float32 prev_1 = vx_setzero_f32(), prev_2 = vx_setzero_f32(), + prev_3 = vx_setzero_f32(); + int j = 0; + for (; j + v_uint16::nlanes * cn <= width; j += v_uint16::nlanes * cn) + { + v_uint8 v_src_row_1, v_src_row_2, v_src_row_3; + v_load_deinterleave(src_row + j, v_src_row_1, v_src_row_2, v_src_row_3); + v_int16 el8_1 = v_reinterpret_as_s16(v_expand_low(v_src_row_1)); + v_int16 el8_2 = v_reinterpret_as_s16(v_expand_low(v_src_row_2)); + v_int16 el8_3 = v_reinterpret_as_s16(v_expand_low(v_src_row_3)); + v_float32 el4l_1, el4h_1, el4l_2, el4h_2, el4l_3, el4h_3; +#if CV_AVX2 && CV_SIMD_WIDTH == 32 + __m256i vsum_1 = _mm256_add_epi16(el8_1.val, _mm256_slli_si256(el8_1.val, 2)); + __m256i vsum_2 = _mm256_add_epi16(el8_2.val, _mm256_slli_si256(el8_2.val, 2)); + __m256i vsum_3 = _mm256_add_epi16(el8_3.val, _mm256_slli_si256(el8_3.val, 2)); + vsum_1 = _mm256_add_epi16(vsum_1, _mm256_slli_si256(vsum_1, 4)); + vsum_2 = _mm256_add_epi16(vsum_2, _mm256_slli_si256(vsum_2, 4)); + vsum_3 = _mm256_add_epi16(vsum_3, _mm256_slli_si256(vsum_3, 4)); + vsum_1 = _mm256_add_epi16(vsum_1, _mm256_slli_si256(vsum_1, 8)); + vsum_2 = _mm256_add_epi16(vsum_2, _mm256_slli_si256(vsum_2, 8)); + vsum_3 = _mm256_add_epi16(vsum_3, _mm256_slli_si256(vsum_3, 8)); + __m256i shmask = _mm256_set1_epi32(7); + el4l_1.val = _mm256_add_ps(_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_v256_extract_low(vsum_1))), prev_1.val); + el4l_2.val = _mm256_add_ps(_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_v256_extract_low(vsum_2))), prev_2.val); + el4l_3.val = _mm256_add_ps(_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_v256_extract_low(vsum_3))), prev_3.val); + el4h_1.val = _mm256_add_ps(_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_v256_extract_high(vsum_1))), _mm256_permutevar8x32_ps(el4l_1.val, shmask)); + el4h_2.val = _mm256_add_ps(_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_v256_extract_high(vsum_2))), _mm256_permutevar8x32_ps(el4l_2.val, shmask)); + el4h_3.val = _mm256_add_ps(_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_v256_extract_high(vsum_3))), _mm256_permutevar8x32_ps(el4l_3.val, shmask)); + prev_1.val = _mm256_permutevar8x32_ps(el4h_1.val, shmask); + prev_2.val = _mm256_permutevar8x32_ps(el4h_2.val, shmask); + prev_3.val = _mm256_permutevar8x32_ps(el4h_3.val, shmask); +#else + el8_1 += v_rotate_left<1>(el8_1); + el8_2 += v_rotate_left<1>(el8_2); + el8_3 += v_rotate_left<1>(el8_3); + el8_1 += v_rotate_left<2>(el8_1); + el8_2 += v_rotate_left<2>(el8_2); + el8_3 += v_rotate_left<2>(el8_3); +#if CV_SIMD_WIDTH >= 32 + el8_1 += v_rotate_left<4>(el8_1); + el8_2 += v_rotate_left<4>(el8_2); + el8_3 += v_rotate_left<4>(el8_3); +#if CV_SIMD_WIDTH == 64 + el8_1 += v_rotate_left<8>(el8_1); + el8_2 += v_rotate_left<8>(el8_2); + el8_3 += v_rotate_left<8>(el8_3); +#endif +#endif + v_int32 el4li_1, el4hi_1, el4li_2, el4hi_2, el4li_3, el4hi_3; + 
v_expand(el8_1, el4li_1, el4hi_1); + v_expand(el8_2, el4li_2, el4hi_2); + v_expand(el8_3, el4li_3, el4hi_3); + el4l_1 = v_cvt_f32(el4li_1) + prev_1; + el4l_2 = v_cvt_f32(el4li_2) + prev_2; + el4l_3 = v_cvt_f32(el4li_3) + prev_3; + el4h_1 = v_cvt_f32(el4hi_1) + el4l_1; + el4h_2 = v_cvt_f32(el4hi_2) + el4l_2; + el4h_3 = v_cvt_f32(el4hi_3) + el4l_3; + prev_1 = v_broadcast_element(el4h_1); + prev_2 = v_broadcast_element(el4h_2); + prev_3 = v_broadcast_element(el4h_3); +#endif + v_store_interleave(row_cache , el4l_1, el4l_2, el4l_3); + v_store_interleave(row_cache + v_float32::nlanes * 3, el4h_1, el4h_2, el4h_3); + el4l_1 = vx_load(row_cache ); + el4l_2 = vx_load(row_cache + v_float32::nlanes ); + el4l_3 = vx_load(row_cache + v_float32::nlanes * 2); + el4h_1 = vx_load(row_cache + v_float32::nlanes * 3); + el4h_2 = vx_load(row_cache + v_float32::nlanes * 4); + el4h_3 = vx_load(row_cache + v_float32::nlanes * 5); + v_store(sum_row + j , el4l_1 + vx_load(prev_sum_row + j )); + v_store(sum_row + j + v_float32::nlanes , el4l_2 + vx_load(prev_sum_row + j + v_float32::nlanes )); + v_store(sum_row + j + v_float32::nlanes * 2, el4l_3 + vx_load(prev_sum_row + j + v_float32::nlanes * 2)); + v_store(sum_row + j + v_float32::nlanes * 3, el4h_1 + vx_load(prev_sum_row + j + v_float32::nlanes * 3)); + v_store(sum_row + j + v_float32::nlanes * 4, el4h_2 + vx_load(prev_sum_row + j + v_float32::nlanes * 4)); + v_store(sum_row + j + v_float32::nlanes * 5, el4h_3 + vx_load(prev_sum_row + j + v_float32::nlanes * 5)); + } + + for (float v3 = sum_row[j - 1] - prev_sum_row[j - 1], + v2 = sum_row[j - 2] - prev_sum_row[j - 2], + v1 = sum_row[j - 3] - prev_sum_row[j - 3]; j < width; j += 3) + { + sum_row[j] = (v1 += src_row[j]) + prev_sum_row[j]; + sum_row[j + 1] = (v2 += src_row[j + 1]) + prev_sum_row[j + 1]; + sum_row[j + 2] = (v3 += src_row[j + 2]) + prev_sum_row[j + 2]; + } + } + } + else if (cn == 4) + { + // the others + for (int i = 0; i < height; ++i) + { + const uchar * src_row = src + _srcstep * i; + float * prev_sum_row = (float *)((uchar *)sum + _sumstep * i) + cn; + float * sum_row = (float *)((uchar *)sum + _sumstep * (i + 1)) + cn; + + sum_row[-1] = sum_row[-2] = sum_row[-3] = sum_row[-4] = 0; + + v_float32 prev = vx_setzero_f32(); + int j = 0; + for ( ; j + v_uint16::nlanes <= width; j += v_uint16::nlanes) + { + v_int16 el8 = v_reinterpret_as_s16(vx_load_expand(src_row + j)); + v_float32 el4l, el4h; +#if CV_AVX2 && CV_SIMD_WIDTH == 32 + __m256i vsum = _mm256_add_epi16(el8.val, _mm256_slli_si256(el8.val, 8)); + el4l.val = _mm256_add_ps(_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_v256_extract_low(vsum))), prev.val); + el4h.val = _mm256_add_ps(_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_v256_extract_high(vsum))), _mm256_permute2f128_ps(el4l.val, el4l.val, 0x31)); + prev.val = _mm256_permute2f128_ps(el4h.val, el4h.val, 0x31); +#else +#if CV_SIMD_WIDTH >= 32 + el8 += v_rotate_left<4>(el8); +#if CV_SIMD_WIDTH == 64 + el8 += v_rotate_left<8>(el8); +#endif +#endif + v_int32 el4li, el4hi; + v_expand(el8, el4li, el4hi); + el4l = v_cvt_f32(el4li) + prev; + el4h = v_cvt_f32(el4hi) + el4l; +#if CV_SIMD_WIDTH == 16 + prev = el4h; +#elif CV_SIMD_WIDTH == 32 + prev = v_combine_high(el4h, el4h); +#else + v_float32 t = v_rotate_right<12>(el4h); + t |= v_rotate_left<4>(t); + prev = v_combine_low(t, t); +#endif +#endif + v_store(sum_row + j , el4l + vx_load(prev_sum_row + j )); + v_store(sum_row + j + v_float32::nlanes, el4h + vx_load(prev_sum_row + j + v_float32::nlanes)); + } + + for (float v4 = sum_row[j - 1] - prev_sum_row[j 
- 1], + v3 = sum_row[j - 2] - prev_sum_row[j - 2], + v2 = sum_row[j - 3] - prev_sum_row[j - 3], + v1 = sum_row[j - 4] - prev_sum_row[j - 4]; j < width; j += 4) + { + sum_row[j] = (v1 += src_row[j]) + prev_sum_row[j]; + sum_row[j + 1] = (v2 += src_row[j + 1]) + prev_sum_row[j + 1]; + sum_row[j + 2] = (v3 += src_row[j + 2]) + prev_sum_row[j + 2]; + sum_row[j + 3] = (v4 += src_row[j + 3]) + prev_sum_row[j + 3]; + } + } + } + else + { + return false; + } + vx_cleanup(); + return true; } }; +#if CV_SIMD128_64F +template <> +struct Integral_SIMD +{ + Integral_SIMD() {} + + bool operator()(const uchar * src, size_t _srcstep, + double * sum, size_t _sumstep, + double * sqsum, size_t _sqsumstep, + double * tilted, size_t, + int width, int height, int cn) const + { +#if CV_AVX512_SKX + if (!tilted && cn <= 4 && (cn > 1 || sqsum)) + { + calculate_integral_avx512(src, _srcstep, sum, _sumstep, sqsum, _sqsumstep, width, height, cn); + return true; + } +#else + CV_UNUSED(_sqsumstep); +#endif + if (sqsum || tilted || cn > 4) + return false; + + width *= cn; + + // the first iteration + memset(sum, 0, (width + cn) * sizeof(double)); + + if (cn == 1) + { + // the others + for (int i = 0; i < height; ++i) + { + const uchar * src_row = src + _srcstep * i; + double * prev_sum_row = (double *)((uchar *)sum + _sumstep * i) + 1; + double * sum_row = (double *)((uchar *)sum + _sumstep * (i + 1)) + 1; + + sum_row[-1] = 0; + + v_float64 prev = vx_setzero_f64(); + int j = 0; + for (; j + v_uint16::nlanes <= width; j += v_uint16::nlanes) + { + v_int16 el8 = v_reinterpret_as_s16(vx_load_expand(src_row + j)); + v_float64 el4ll, el4lh, el4hl, el4hh; +#if CV_AVX2 && CV_SIMD_WIDTH == 32 + __m256i vsum = _mm256_add_epi16(el8.val, _mm256_slli_si256(el8.val, 2)); + vsum = _mm256_add_epi16(vsum, _mm256_slli_si256(vsum, 4)); + vsum = _mm256_add_epi16(vsum, _mm256_slli_si256(vsum, 8)); + __m256i el4l_32 = _mm256_cvtepi16_epi32(_v256_extract_low(vsum)); + __m256i el4h_32 = _mm256_cvtepi16_epi32(_v256_extract_high(vsum)); + el4ll.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_low(el4l_32)), prev.val); + el4lh.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_high(el4l_32)), prev.val); + __m256d el4d = _mm256_permute4x64_pd(el4lh.val, 0xff); + el4hl.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_low(el4h_32)), el4d); + el4hh.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_high(el4h_32)), el4d); + prev.val = _mm256_permute4x64_pd(el4hh.val, 0xff); +#else + el8 += v_rotate_left<1>(el8); + el8 += v_rotate_left<2>(el8); +#if CV_SIMD_WIDTH >= 32 + el8 += v_rotate_left<4>(el8); +#if CV_SIMD_WIDTH == 64 + el8 += v_rotate_left<8>(el8); +#endif +#endif + v_int32 el4li, el4hi; + v_expand(el8, el4li, el4hi); + el4ll = v_cvt_f64(el4li) + prev; + el4lh = v_cvt_f64_high(el4li) + prev; + el4hl = v_cvt_f64(el4hi) + el4ll; + el4hh = v_cvt_f64_high(el4hi) + el4lh; + prev = vx_setall_f64(v_extract_n(el4hh)); +// prev = v_broadcast_element(el4hh); +#endif + v_store(sum_row + j , el4ll + vx_load(prev_sum_row + j )); + v_store(sum_row + j + v_float64::nlanes , el4lh + vx_load(prev_sum_row + j + v_float64::nlanes )); + v_store(sum_row + j + v_float64::nlanes * 2, el4hl + vx_load(prev_sum_row + j + v_float64::nlanes * 2)); + v_store(sum_row + j + v_float64::nlanes * 3, el4hh + vx_load(prev_sum_row + j + v_float64::nlanes * 3)); + } + + for (double v = sum_row[j - 1] - prev_sum_row[j - 1]; j < width; ++j) + sum_row[j] = (v += src_row[j]) + prev_sum_row[j]; + } + } + else if (cn == 2) + { + // the others + v_int16 mask = 
vx_setall_s16((short)0xff); + for (int i = 0; i < height; ++i) + { + const uchar * src_row = src + _srcstep * i; + double * prev_sum_row = (double *)((uchar *)sum + _sumstep * i) + cn; + double * sum_row = (double *)((uchar *)sum + _sumstep * (i + 1)) + cn; + + sum_row[-1] = sum_row[-2] = 0; + + v_float64 prev_1 = vx_setzero_f64(), prev_2 = vx_setzero_f64(); + int j = 0; + for (; j + v_uint16::nlanes * cn <= width; j += v_uint16::nlanes * cn) + { + v_int16 v_src_row = v_reinterpret_as_s16(vx_load(src_row + j)); + v_int16 el8_1 = v_src_row & mask; + v_int16 el8_2 = v_reinterpret_as_s16(v_reinterpret_as_u16(v_src_row) >> 8); + v_float64 el4ll_1, el4lh_1, el4hl_1, el4hh_1, el4ll_2, el4lh_2, el4hl_2, el4hh_2; +#if CV_AVX2 && CV_SIMD_WIDTH == 32 + __m256i vsum_1 = _mm256_add_epi16(el8_1.val, _mm256_slli_si256(el8_1.val, 2)); + __m256i vsum_2 = _mm256_add_epi16(el8_2.val, _mm256_slli_si256(el8_2.val, 2)); + vsum_1 = _mm256_add_epi16(vsum_1, _mm256_slli_si256(vsum_1, 4)); + vsum_2 = _mm256_add_epi16(vsum_2, _mm256_slli_si256(vsum_2, 4)); + vsum_1 = _mm256_add_epi16(vsum_1, _mm256_slli_si256(vsum_1, 8)); + vsum_2 = _mm256_add_epi16(vsum_2, _mm256_slli_si256(vsum_2, 8)); + __m256i el4l1_32 = _mm256_cvtepi16_epi32(_v256_extract_low(vsum_1)); + __m256i el4l2_32 = _mm256_cvtepi16_epi32(_v256_extract_low(vsum_2)); + __m256i el4h1_32 = _mm256_cvtepi16_epi32(_v256_extract_high(vsum_1)); + __m256i el4h2_32 = _mm256_cvtepi16_epi32(_v256_extract_high(vsum_2)); + el4ll_1.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_low(el4l1_32)), prev_1.val); + el4ll_2.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_low(el4l2_32)), prev_2.val); + el4lh_1.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_high(el4l1_32)), prev_1.val); + el4lh_2.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_high(el4l2_32)), prev_2.val); + __m256d el4d_1 = _mm256_permute4x64_pd(el4lh_1.val, 0xff); + __m256d el4d_2 = _mm256_permute4x64_pd(el4lh_2.val, 0xff); + el4hl_1.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_low(el4h1_32)), el4d_1); + el4hl_2.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_low(el4h2_32)), el4d_2); + el4hh_1.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_high(el4h1_32)), el4d_1); + el4hh_2.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_high(el4h2_32)), el4d_2); + prev_1.val = _mm256_permute4x64_pd(el4hh_1.val, 0xff); + prev_2.val = _mm256_permute4x64_pd(el4hh_2.val, 0xff); +#else + el8_1 += v_rotate_left<1>(el8_1); + el8_2 += v_rotate_left<1>(el8_2); + el8_1 += v_rotate_left<2>(el8_1); + el8_2 += v_rotate_left<2>(el8_2); +#if CV_SIMD_WIDTH >= 32 + el8_1 += v_rotate_left<4>(el8_1); + el8_2 += v_rotate_left<4>(el8_2); +#if CV_SIMD_WIDTH == 64 + el8_1 += v_rotate_left<8>(el8_1); + el8_2 += v_rotate_left<8>(el8_2); +#endif +#endif + v_int32 el4li_1, el4hi_1, el4li_2, el4hi_2; + v_expand(el8_1, el4li_1, el4hi_1); + v_expand(el8_2, el4li_2, el4hi_2); + el4ll_1 = v_cvt_f64(el4li_1) + prev_1; + el4ll_2 = v_cvt_f64(el4li_2) + prev_2; + el4lh_1 = v_cvt_f64_high(el4li_1) + prev_1; + el4lh_2 = v_cvt_f64_high(el4li_2) + prev_2; + el4hl_1 = v_cvt_f64(el4hi_1) + el4ll_1; + el4hl_2 = v_cvt_f64(el4hi_2) + el4ll_2; + el4hh_1 = v_cvt_f64_high(el4hi_1) + el4lh_1; + el4hh_2 = v_cvt_f64_high(el4hi_2) + el4lh_2; + prev_1 = vx_setall_f64(v_extract_n(el4hh_1)); + prev_2 = vx_setall_f64(v_extract_n(el4hh_2)); +// prev_1 = v_broadcast_element(el4hh_1); +// prev_2 = v_broadcast_element(el4hh_2); +#endif + v_float64 el4_1, el4_2, el4_3, el4_4, el4_5, el4_6, el4_7, el4_8; + v_zip(el4ll_1, el4ll_2, el4_1, 
el4_2); + v_zip(el4lh_1, el4lh_2, el4_3, el4_4); + v_zip(el4hl_1, el4hl_2, el4_5, el4_6); + v_zip(el4hh_1, el4hh_2, el4_7, el4_8); + v_store(sum_row + j , el4_1 + vx_load(prev_sum_row + j )); + v_store(sum_row + j + v_float64::nlanes , el4_2 + vx_load(prev_sum_row + j + v_float64::nlanes )); + v_store(sum_row + j + v_float64::nlanes * 2, el4_3 + vx_load(prev_sum_row + j + v_float64::nlanes * 2)); + v_store(sum_row + j + v_float64::nlanes * 3, el4_4 + vx_load(prev_sum_row + j + v_float64::nlanes * 3)); + v_store(sum_row + j + v_float64::nlanes * 4, el4_5 + vx_load(prev_sum_row + j + v_float64::nlanes * 4)); + v_store(sum_row + j + v_float64::nlanes * 5, el4_6 + vx_load(prev_sum_row + j + v_float64::nlanes * 5)); + v_store(sum_row + j + v_float64::nlanes * 6, el4_7 + vx_load(prev_sum_row + j + v_float64::nlanes * 6)); + v_store(sum_row + j + v_float64::nlanes * 7, el4_8 + vx_load(prev_sum_row + j + v_float64::nlanes * 7)); + } + + for (double v2 = sum_row[j - 1] - prev_sum_row[j - 1], + v1 = sum_row[j - 2] - prev_sum_row[j - 2]; j < width; j += 2) + { + sum_row[j] = (v1 += src_row[j]) + prev_sum_row[j]; + sum_row[j + 1] = (v2 += src_row[j + 1]) + prev_sum_row[j + 1]; + } + } + } + else if (cn == 3) + { + // the others + for (int i = 0; i < height; ++i) + { + const uchar * src_row = src + _srcstep * i; + double * prev_sum_row = (double *)((uchar *)sum + _sumstep * i) + cn; + double * sum_row = (double *)((uchar *)sum + _sumstep * (i + 1)) + cn; + double row_cache[v_float64::nlanes * 12]; + + sum_row[-1] = sum_row[-2] = sum_row[-3] = 0; + + v_float64 prev_1 = vx_setzero_f64(), prev_2 = vx_setzero_f64(), + prev_3 = vx_setzero_f64(); + int j = 0; + for (; j + v_uint16::nlanes * cn <= width; j += v_uint16::nlanes * cn) + { + v_uint8 v_src_row_1, v_src_row_2, v_src_row_3; + v_load_deinterleave(src_row + j, v_src_row_1, v_src_row_2, v_src_row_3); + v_int16 el8_1 = v_reinterpret_as_s16(v_expand_low(v_src_row_1)); + v_int16 el8_2 = v_reinterpret_as_s16(v_expand_low(v_src_row_2)); + v_int16 el8_3 = v_reinterpret_as_s16(v_expand_low(v_src_row_3)); + v_float64 el4ll_1, el4lh_1, el4hl_1, el4hh_1, el4ll_2, el4lh_2, el4hl_2, el4hh_2, el4ll_3, el4lh_3, el4hl_3, el4hh_3; +#if CV_AVX2 && CV_SIMD_WIDTH == 32 + __m256i vsum_1 = _mm256_add_epi16(el8_1.val, _mm256_slli_si256(el8_1.val, 2)); + __m256i vsum_2 = _mm256_add_epi16(el8_2.val, _mm256_slli_si256(el8_2.val, 2)); + __m256i vsum_3 = _mm256_add_epi16(el8_3.val, _mm256_slli_si256(el8_3.val, 2)); + vsum_1 = _mm256_add_epi16(vsum_1, _mm256_slli_si256(vsum_1, 4)); + vsum_2 = _mm256_add_epi16(vsum_2, _mm256_slli_si256(vsum_2, 4)); + vsum_3 = _mm256_add_epi16(vsum_3, _mm256_slli_si256(vsum_3, 4)); + vsum_1 = _mm256_add_epi16(vsum_1, _mm256_slli_si256(vsum_1, 8)); + vsum_2 = _mm256_add_epi16(vsum_2, _mm256_slli_si256(vsum_2, 8)); + vsum_3 = _mm256_add_epi16(vsum_3, _mm256_slli_si256(vsum_3, 8)); + __m256i el4l1_32 = _mm256_cvtepi16_epi32(_v256_extract_low(vsum_1)); + __m256i el4l2_32 = _mm256_cvtepi16_epi32(_v256_extract_low(vsum_2)); + __m256i el4l3_32 = _mm256_cvtepi16_epi32(_v256_extract_low(vsum_3)); + __m256i el4h1_32 = _mm256_cvtepi16_epi32(_v256_extract_high(vsum_1)); + __m256i el4h2_32 = _mm256_cvtepi16_epi32(_v256_extract_high(vsum_2)); + __m256i el4h3_32 = _mm256_cvtepi16_epi32(_v256_extract_high(vsum_3)); + el4ll_1.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_low(el4l1_32)), prev_1.val); + el4ll_2.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_low(el4l2_32)), prev_2.val); + el4ll_3.val = 
_mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_low(el4l3_32)), prev_3.val); + el4lh_1.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_high(el4l1_32)), prev_1.val); + el4lh_2.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_high(el4l2_32)), prev_2.val); + el4lh_3.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_high(el4l3_32)), prev_3.val); + __m256d el4d_1 = _mm256_permute4x64_pd(el4lh_1.val, 0xff); + __m256d el4d_2 = _mm256_permute4x64_pd(el4lh_2.val, 0xff); + __m256d el4d_3 = _mm256_permute4x64_pd(el4lh_3.val, 0xff); + el4hl_1.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_low(el4h1_32)), el4d_1); + el4hl_2.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_low(el4h2_32)), el4d_2); + el4hl_3.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_low(el4h3_32)), el4d_3); + el4hh_1.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_high(el4h1_32)), el4d_1); + el4hh_2.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_high(el4h2_32)), el4d_2); + el4hh_3.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_high(el4h3_32)), el4d_3); + prev_1.val = _mm256_permute4x64_pd(el4hh_1.val, 0xff); + prev_2.val = _mm256_permute4x64_pd(el4hh_2.val, 0xff); + prev_3.val = _mm256_permute4x64_pd(el4hh_3.val, 0xff); +#else + el8_1 += v_rotate_left<1>(el8_1); + el8_2 += v_rotate_left<1>(el8_2); + el8_3 += v_rotate_left<1>(el8_3); + el8_1 += v_rotate_left<2>(el8_1); + el8_2 += v_rotate_left<2>(el8_2); + el8_3 += v_rotate_left<2>(el8_3); +#if CV_SIMD_WIDTH >= 32 + el8_1 += v_rotate_left<4>(el8_1); + el8_2 += v_rotate_left<4>(el8_2); + el8_3 += v_rotate_left<4>(el8_3); +#if CV_SIMD_WIDTH == 64 + el8_1 += v_rotate_left<8>(el8_1); + el8_2 += v_rotate_left<8>(el8_2); + el8_3 += v_rotate_left<8>(el8_3); +#endif +#endif + v_int32 el4li_1, el4hi_1, el4li_2, el4hi_2, el4li_3, el4hi_3; + v_expand(el8_1, el4li_1, el4hi_1); + v_expand(el8_2, el4li_2, el4hi_2); + v_expand(el8_3, el4li_3, el4hi_3); + el4ll_1 = v_cvt_f64(el4li_1) + prev_1; + el4ll_2 = v_cvt_f64(el4li_2) + prev_2; + el4ll_3 = v_cvt_f64(el4li_3) + prev_3; + el4lh_1 = v_cvt_f64_high(el4li_1) + prev_1; + el4lh_2 = v_cvt_f64_high(el4li_2) + prev_2; + el4lh_3 = v_cvt_f64_high(el4li_3) + prev_3; + el4hl_1 = v_cvt_f64(el4hi_1) + el4ll_1; + el4hl_2 = v_cvt_f64(el4hi_2) + el4ll_2; + el4hl_3 = v_cvt_f64(el4hi_3) + el4ll_3; + el4hh_1 = v_cvt_f64_high(el4hi_1) + el4lh_1; + el4hh_2 = v_cvt_f64_high(el4hi_2) + el4lh_2; + el4hh_3 = v_cvt_f64_high(el4hi_3) + el4lh_3; + prev_1 = vx_setall_f64(v_extract_n(el4hh_1)); + prev_2 = vx_setall_f64(v_extract_n(el4hh_2)); + prev_3 = vx_setall_f64(v_extract_n(el4hh_3)); +// prev_1 = v_broadcast_element(el4hh_1); +// prev_2 = v_broadcast_element(el4hh_2); +// prev_3 = v_broadcast_element(el4hh_3); +#endif + v_store_interleave(row_cache , el4ll_1, el4ll_2, el4ll_3); + v_store_interleave(row_cache + v_float64::nlanes * 3, el4lh_1, el4lh_2, el4lh_3); + v_store_interleave(row_cache + v_float64::nlanes * 6, el4hl_1, el4hl_2, el4hl_3); + v_store_interleave(row_cache + v_float64::nlanes * 9, el4hh_1, el4hh_2, el4hh_3); + el4ll_1 = vx_load(row_cache ); + el4ll_2 = vx_load(row_cache + v_float64::nlanes ); + el4ll_3 = vx_load(row_cache + v_float64::nlanes * 2 ); + el4lh_1 = vx_load(row_cache + v_float64::nlanes * 3 ); + el4lh_2 = vx_load(row_cache + v_float64::nlanes * 4 ); + el4lh_3 = vx_load(row_cache + v_float64::nlanes * 5 ); + el4hl_1 = vx_load(row_cache + v_float64::nlanes * 6 ); + el4hl_2 = vx_load(row_cache + v_float64::nlanes * 7 ); + el4hl_3 = vx_load(row_cache + v_float64::nlanes * 8 ); + el4hh_1 = 
vx_load(row_cache + v_float64::nlanes * 9 ); + el4hh_2 = vx_load(row_cache + v_float64::nlanes * 10); + el4hh_3 = vx_load(row_cache + v_float64::nlanes * 11); + v_store(sum_row + j , el4ll_1 + vx_load(prev_sum_row + j )); + v_store(sum_row + j + v_float64::nlanes , el4ll_2 + vx_load(prev_sum_row + j + v_float64::nlanes )); + v_store(sum_row + j + v_float64::nlanes * 2 , el4ll_3 + vx_load(prev_sum_row + j + v_float64::nlanes * 2 )); + v_store(sum_row + j + v_float64::nlanes * 3 , el4lh_1 + vx_load(prev_sum_row + j + v_float64::nlanes * 3 )); + v_store(sum_row + j + v_float64::nlanes * 4 , el4lh_2 + vx_load(prev_sum_row + j + v_float64::nlanes * 4 )); + v_store(sum_row + j + v_float64::nlanes * 5 , el4lh_3 + vx_load(prev_sum_row + j + v_float64::nlanes * 5 )); + v_store(sum_row + j + v_float64::nlanes * 6 , el4hl_1 + vx_load(prev_sum_row + j + v_float64::nlanes * 6 )); + v_store(sum_row + j + v_float64::nlanes * 7 , el4hl_2 + vx_load(prev_sum_row + j + v_float64::nlanes * 7 )); + v_store(sum_row + j + v_float64::nlanes * 8 , el4hl_3 + vx_load(prev_sum_row + j + v_float64::nlanes * 8 )); + v_store(sum_row + j + v_float64::nlanes * 9 , el4hh_1 + vx_load(prev_sum_row + j + v_float64::nlanes * 9 )); + v_store(sum_row + j + v_float64::nlanes * 10, el4hh_2 + vx_load(prev_sum_row + j + v_float64::nlanes * 10)); + v_store(sum_row + j + v_float64::nlanes * 11, el4hh_3 + vx_load(prev_sum_row + j + v_float64::nlanes * 11)); + } + + for (double v3 = sum_row[j - 1] - prev_sum_row[j - 1], + v2 = sum_row[j - 2] - prev_sum_row[j - 2], + v1 = sum_row[j - 3] - prev_sum_row[j - 3]; j < width; j += 3) + { + sum_row[j] = (v1 += src_row[j]) + prev_sum_row[j]; + sum_row[j + 1] = (v2 += src_row[j + 1]) + prev_sum_row[j + 1]; + sum_row[j + 2] = (v3 += src_row[j + 2]) + prev_sum_row[j + 2]; + } + } + } + else if (cn == 4) + { + // the others + for (int i = 0; i < height; ++i) + { + const uchar * src_row = src + _srcstep * i; + double * prev_sum_row = (double *)((uchar *)sum + _sumstep * i) + cn; + double * sum_row = (double *)((uchar *)sum + _sumstep * (i + 1)) + cn; + + sum_row[-1] = sum_row[-2] = sum_row[-3] = sum_row[-4] = 0; + + v_float64 prev_1 = vx_setzero_f64(), prev_2 = vx_setzero_f64(); + int j = 0; + for ( ; j + v_uint16::nlanes <= width; j += v_uint16::nlanes) + { + v_int16 el8 = v_reinterpret_as_s16(vx_load_expand(src_row + j)); + v_float64 el4ll, el4lh, el4hl, el4hh; +#if CV_AVX2 && CV_SIMD_WIDTH == 32 + __m256i vsum = _mm256_add_epi16(el8.val, _mm256_slli_si256(el8.val, 8)); + __m256i el4l_32 = _mm256_cvtepi16_epi32(_v256_extract_low(vsum)); + __m256i el4h_32 = _mm256_cvtepi16_epi32(_v256_extract_high(vsum)); + el4ll.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_low(el4l_32)), prev_1.val); + el4lh.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_high(el4l_32)), prev_2.val); + el4hl.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_low(el4h_32)), el4lh.val); + el4hh.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_high(el4h_32)), el4lh.val); + prev_1.val = prev_2.val = el4hh.val; +#else +#if CV_SIMD_WIDTH >= 32 + el8 += v_rotate_left<4>(el8); +#if CV_SIMD_WIDTH == 64 + el8 += v_rotate_left<8>(el8); +#endif +#endif + v_int32 el4li, el4hi; + v_expand(el8, el4li, el4hi); + el4ll = v_cvt_f64(el4li) + prev_1; + el4lh = v_cvt_f64_high(el4li) + prev_2; + el4hl = v_cvt_f64(el4hi) + el4ll; + el4hh = v_cvt_f64_high(el4hi) + el4lh; +#if CV_SIMD_WIDTH == 16 + prev_1 = el4hl; + prev_2 = el4hh; +#elif CV_SIMD_WIDTH == 32 + prev_1 = prev_2 = el4hh; +#else + prev_1 = prev_2 = v_combine_high(el4hh, 
el4hh); +#endif +#endif + v_store(sum_row + j , el4ll + vx_load(prev_sum_row + j )); + v_store(sum_row + j + v_float64::nlanes , el4lh + vx_load(prev_sum_row + j + v_float64::nlanes )); + v_store(sum_row + j + v_float64::nlanes * 2, el4hl + vx_load(prev_sum_row + j + v_float64::nlanes * 2)); + v_store(sum_row + j + v_float64::nlanes * 3, el4hh + vx_load(prev_sum_row + j + v_float64::nlanes * 3)); + } + + for (double v4 = sum_row[j - 1] - prev_sum_row[j - 1], + v3 = sum_row[j - 2] - prev_sum_row[j - 2], + v2 = sum_row[j - 3] - prev_sum_row[j - 3], + v1 = sum_row[j - 4] - prev_sum_row[j - 4]; j < width; j += 4) + { + sum_row[j] = (v1 += src_row[j]) + prev_sum_row[j]; + sum_row[j + 1] = (v2 += src_row[j + 1]) + prev_sum_row[j + 1]; + sum_row[j + 2] = (v3 += src_row[j + 2]) + prev_sum_row[j + 2]; + sum_row[j + 3] = (v4 += src_row[j + 3]) + prev_sum_row[j + 3]; + } + } + } + else + { + return false; + } + vx_cleanup(); + + return true; + } +}; +#endif + #endif } // namespace anon From f00607c8ac3b567873c96a331ab3be6586c6c16f Mon Sep 17 00:00:00 2001 From: firebladed <34522909+firebladed@users.noreply.github.com> Date: Fri, 28 Feb 2020 18:17:30 +0000 Subject: [PATCH 5/6] Merge pull request #16626 from firebladed:firebladed-v4l2_pix_fmt_y12 V4L2: Add V4L2_PIX_FMT_Y12 (12 bit grey) support --- modules/videoio/src/cap_v4l.cpp | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/modules/videoio/src/cap_v4l.cpp b/modules/videoio/src/cap_v4l.cpp index d289aa0a9b..dc82e84571 100644 --- a/modules/videoio/src/cap_v4l.cpp +++ b/modules/videoio/src/cap_v4l.cpp @@ -262,6 +262,10 @@ make & enjoy! #define V4L2_PIX_FMT_Y10 v4l2_fourcc('Y', '1', '0', ' ') #endif +#ifndef V4L2_PIX_FMT_Y12 +#define V4L2_PIX_FMT_Y12 v4l2_fourcc('Y', '1', '2', ' ') +#endif + /* Defaults - If your board can do better, set it here. Set for the most common type inputs. 
*/ #define DEFAULT_V4L_WIDTH 640 #define DEFAULT_V4L_HEIGHT 480 @@ -561,6 +565,7 @@ bool CvCaptureCAM_V4L::autosetup_capture_mode_v4l2() V4L2_PIX_FMT_JPEG, #endif V4L2_PIX_FMT_Y16, + V4L2_PIX_FMT_Y12, V4L2_PIX_FMT_Y10, V4L2_PIX_FMT_GREY, }; @@ -654,6 +659,7 @@ void CvCaptureCAM_V4L::v4l2_create_frame() size.height = size.height * 3 / 2; // "1.5" channels break; case V4L2_PIX_FMT_Y16: + case V4L2_PIX_FMT_Y12: case V4L2_PIX_FMT_Y10: depth = IPL_DEPTH_16U; /* fallthru */ @@ -1578,6 +1584,13 @@ void CvCaptureCAM_V4L::convertToRgb(const Buffer ¤tBuffer) cv::cvtColor(temp, destination, COLOR_GRAY2BGR); return; } + case V4L2_PIX_FMT_Y12: + { + cv::Mat temp(imageSize, CV_8UC1, buffers[MAX_V4L_BUFFERS].start); + cv::Mat(imageSize, CV_16UC1, currentBuffer.start).convertTo(temp, CV_8U, 1.0 / 16); + cv::cvtColor(temp, destination, COLOR_GRAY2BGR); + return; + } case V4L2_PIX_FMT_Y10: { cv::Mat temp(imageSize, CV_8UC1, buffers[MAX_V4L_BUFFERS].start); From 5012fc5d23eba9d6e75a521ea01f096f7b7d4c9d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20W=C3=BCrtz?= Date: Fri, 28 Feb 2020 19:21:03 +0100 Subject: [PATCH 6/6] Merge pull request #16684 from pwuertz:ignore_clang_mat_inl * Ignore clang warnings for deprecated enum+enum operations in mat.inl.hpp * build: added customization macros, cmake flags for OpenCV build --- cmake/OpenCVCompilerOptions.cmake | 4 ++++ modules/core/include/opencv2/core/mat.inl.hpp | 20 +++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/cmake/OpenCVCompilerOptions.cmake b/cmake/OpenCVCompilerOptions.cmake index eafca64068..476156f256 100644 --- a/cmake/OpenCVCompilerOptions.cmake +++ b/cmake/OpenCVCompilerOptions.cmake @@ -151,6 +151,10 @@ if(CV_GCC OR CV_CLANG) if(CV_GCC AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.0) add_extra_compiler_option(-Wno-missing-field-initializers) # GCC 4.x emits warnings about {}, fixed in GCC 5+ endif() + if(CV_CLANG AND NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 10.0) + add_extra_compiler_option(-Wno-deprecated-enum-enum-conversion) + add_extra_compiler_option(-Wno-deprecated-anon-enum-enum-conversion) + endif() endif() add_extra_compiler_option(-fdiagnostics-show-option) diff --git a/modules/core/include/opencv2/core/mat.inl.hpp b/modules/core/include/opencv2/core/mat.inl.hpp index 4d8d6f059d..f2efe1c820 100644 --- a/modules/core/include/opencv2/core/mat.inl.hpp +++ b/modules/core/include/opencv2/core/mat.inl.hpp @@ -54,6 +54,21 @@ #pragma warning( disable: 4127 ) #endif +#if defined(CV_SKIP_DISABLE_CLANG_ENUM_WARNINGS) + // nothing +#elif defined(CV_FORCE_DISABLE_CLANG_ENUM_WARNINGS) + #define CV_DISABLE_CLANG_ENUM_WARNINGS +#elif defined(__clang__) && defined(__has_warning) + #if __has_warning("-Wdeprecated-enum-enum-conversion") && __has_warning("-Wdeprecated-anon-enum-enum-conversion") + #define CV_DISABLE_CLANG_ENUM_WARNINGS + #endif +#endif +#ifdef CV_DISABLE_CLANG_ENUM_WARNINGS +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wdeprecated-enum-enum-conversion" +#pragma clang diagnostic ignored "-Wdeprecated-anon-enum-enum-conversion" +#endif + namespace cv { CV__DEBUG_NS_BEGIN @@ -4034,4 +4049,9 @@ inline void UMatData::markDeviceCopyObsolete(bool flag) #pragma warning( pop ) #endif +#ifdef CV_DISABLE_CLANG_ENUM_WARNINGS +#undef CV_DISABLE_CLANG_ENUM_WARNINGS +#pragma clang diagnostic pop +#endif + #endif
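Note on PATCH 1/6: the change walks the runtime-built ngraph::Function instead of iterating the CNNNetwork layers, which is only relevant for non-CPU targets and for networks constructed by the DNN module at run time (IR models may predate IRv10 and are skipped). The sketch below isolates that check. It is an illustration only, not code from the patch; it relies on the nGraph API already used in ie_ngraph.cpp (ngraph::Function::get_ops(), Node::description()) and the kOpenCVLayersType constant defined there, and the hasOpenCVFallbackNode name is made up for the sketch.

#include <ngraph/ngraph.hpp>
#include <memory>
#include <string>

// Returns true when the graph contains a custom OpenCV fallback layer, in which case
// initPlugin() has to compile the network for a HETERO device instead of the plain target.
static bool hasOpenCVFallbackNode(const std::shared_ptr<ngraph::Function>& func,
                                  const std::string& kOpenCVLayersType)
{
    if (!func)
        return false;                                     // no nGraph function to inspect
    for (const auto& node : func->get_ops())              // every operation of the graph
        if (node->description() == kOpenCVLayersType)     // type name registered by OpenCV
            return true;
    return false;
}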
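Note on PATCH 2/6: the new DefaultViewPort::copy2Clipbrd() slot, bound to CTRL+C through shortcut_copy_clipbrd, uses the standard Qt pattern of rendering a widget into a QPixmap and handing it to the global clipboard. A stand-alone sketch of that pattern, independent of the highgui window classes (the function name is invented for the example):

#include <QApplication>
#include <QClipboard>
#include <QPixmap>
#include <QWidget>

// Render any widget into a pixmap and place it on the application clipboard.
static void copyWidgetToClipboard(QWidget* widget)
{
    QPixmap pixmap(widget->size());                 // pixmap with the widget's current size
    widget->render(&pixmap);                        // paint the widget contents into the pixmap
    QApplication::clipboard()->setPixmap(pixmap);   // the clipboard stores its own copy
}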
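Note on PATCH 4/6: every SIMD branch added to sumpixels.simd.hpp computes the same recurrence, only vectorized: the integral image gains one zero row and cn zero columns, and each output element is the per-channel prefix sum of the current source row plus the element directly above it. The cn == 2 and cn == 3 paths deinterleave the channels first so each lane carries one channel's prefix sum, while cn == 4 keeps the pixels interleaved because a rotate by four 16-bit lanes already steps a whole pixel. Below is a scalar reference of that recurrence, written to match the pointer arithmetic in the patch; it is an illustration, not code taken from it.

#include <cstring>
#include <vector>

// sum has (height + 1) rows of (width * cn + cn) elements; both steps are in bytes, as in OpenCV.
static void integral_scalar(const unsigned char* src, size_t srcstep,
                            int* sum, size_t sumstep,
                            int width, int height, int cn)
{
    width *= cn;                                      // interleaved element count per row
    std::memset(sum, 0, (width + cn) * sizeof(int));  // first row of the integral is all zeros

    for (int i = 0; i < height; ++i)
    {
        const unsigned char* src_row = src + srcstep * i;
        const int* prev_sum_row = (const int*)((const unsigned char*)sum + sumstep * i) + cn;
        int* sum_row = (int*)((unsigned char*)sum + sumstep * (i + 1)) + cn;

        for (int c = 1; c <= cn; ++c)
            sum_row[-c] = 0;                          // leading zero column for each channel

        std::vector<int> running(cn, 0);              // per-channel prefix sum along this row
        for (int j = 0; j < width; ++j)
        {
            running[j % cn] += src_row[j];
            sum_row[j] = running[j % cn] + prev_sum_row[j];
        }
    }
}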
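Note on PATCH 5/6: V4L2_PIX_FMT_Y12 delivers one 16-bit word per pixel with 12 valid bits, so the capture path treats it like Y16/Y10 (a 16-bit frame) and, when BGR output is requested, rescales by 1.0/16 so the 0..4095 range fits into 8 bits before the GRAY2BGR conversion. A minimal stand-alone sketch of that conversion step (the real code reuses the preallocated capture buffers instead of returning a new Mat):

#include <opencv2/imgproc.hpp>

// Convert a 12-bit grey frame, stored as CV_16UC1 with values 0..4095, to 8-bit BGR.
static cv::Mat y12ToBgr(const cv::Mat& y12)
{
    cv::Mat grey8;
    y12.convertTo(grey8, CV_8U, 1.0 / 16);            // keep the top 8 of the 12 valid bits
    cv::Mat bgr;
    cv::cvtColor(grey8, bgr, cv::COLOR_GRAY2BGR);     // replicate grey into three channels
    return bgr;
}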
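Note on PATCH 6/6: the header only touches the diagnostics when the compiler actually knows the new clang 10 warnings (checked with __has_warning), and the CV_SKIP_DISABLE_CLANG_ENUM_WARNINGS / CV_FORCE_DISABLE_CLANG_ENUM_WARNINGS macros let a build override that detection; the CMake change adds the matching -Wno- options for clang 10 and newer. The shape of the guard, reduced to a single warning and using a placeholder macro name (MY_DISABLE_ENUM_WARNINGS is not the name used in mat.inl.hpp):

// Define the guard only when clang supports the warning; other compilers and older
// clang versions leave it undefined, so the pragmas below disappear.
#if defined(__clang__) && defined(__has_warning)
#  if __has_warning("-Wdeprecated-enum-enum-conversion")
#    define MY_DISABLE_ENUM_WARNINGS
#  endif
#endif

#ifdef MY_DISABLE_ENUM_WARNINGS
#  pragma clang diagnostic push
#  pragma clang diagnostic ignored "-Wdeprecated-enum-enum-conversion"
#endif

// ... header body that mixes enumerators of different (or anonymous) enum types in arithmetic ...

#ifdef MY_DISABLE_ENUM_WARNINGS
#  undef MY_DISABLE_ENUM_WARNINGS
#  pragma clang diagnostic pop
#endif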