// This file is part of OpenCV project. // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // // Copyright (C) 2019-2020, Intel Corporation, all rights reserved. #include "opencv2/core/hal/intrin.hpp" namespace cv { namespace hal { CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN namespace { // Anonymous namespace to avoid exposing the implementation classes // // NOTE: Look at the bottom of the file for the entry-point function for external callers // template class IntegralCalculator; template class IntegralCalculator { public: IntegralCalculator() {}; void calculate_integral_avx512(const uchar *src, size_t _srcstep, double *sum, size_t _sumstep, double *sqsum, size_t _sqsumstep, int width, int height) { const int srcstep = (int)(_srcstep/sizeof(uchar)); const int sumstep = (int)(_sumstep/sizeof(double)); const int sqsumstep = (int)(_sqsumstep/sizeof(double)); const int ops_per_line = width * num_channels; // Clear the first line of the sum as per spec (see integral documentation) // Also adjust the index of sum and sqsum to be at the real 0th element // and not point to the border pixel so it stays in sync with the src pointer memset( sum, 0, (ops_per_line+num_channels)*sizeof(double)); sum += num_channels; if (sqsum) { memset( sqsum, 0, (ops_per_line+num_channels)*sizeof(double)); sqsum += num_channels; } // Now calculate the integral over the whole image one line at a time for(int y = 0; y < height; y++) { const uchar * src_line = &src[y*srcstep]; double * sum_above = &sum[y*sumstep]; double * sum_line = &sum_above[sumstep]; double * sqsum_above = (sqsum) ? &sqsum[y*sqsumstep] : NULL; double * sqsum_line = (sqsum) ? &sqsum_above[sqsumstep] : NULL; calculate_integral_for_line(src_line, sum_line, sum_above, sqsum_line, sqsum_above, ops_per_line); } } static CV_ALWAYS_INLINE void calculate_integral_for_line(const uchar *srcs, double *sums, double *sums_above, double *sqsums, double *sqsums_above, int num_ops_in_line) { __m512i sum_accumulator = _mm512_setzero_si512(); // holds rolling sums for the line __m512i sqsum_accumulator = _mm512_setzero_si512(); // holds rolling sqsums for the line // The first element on each line must be zeroes as per spec (see integral documentation) zero_out_border_pixel(sums, sqsums); // Do all 64 byte chunk operations then do the last bits that don't fit in a 64 byte chunk aligned_integral( srcs, sums, sums_above, sqsums, sqsums_above, sum_accumulator, sqsum_accumulator, num_ops_in_line); post_aligned_integral(srcs, sums, sums_above, sqsums, sqsums_above, sum_accumulator, sqsum_accumulator, num_ops_in_line); } static CV_ALWAYS_INLINE void zero_out_border_pixel(double *sums, double *sqsums) { // Note the negative index is because the sums/sqsums pointers point to the first real pixel // after the border pixel so we have to look backwards _mm512_mask_storeu_epi64(&sums[-(ptrdiff_t)num_channels], (1<> 6; // quick int divide by 64 for (int index_64byte_chunk = 0; index_64byte_chunk < num_chunks; index_64byte_chunk++){ integral_64_operations_avx512((__m512i *) srcs, (__m512i *) sums, (__m512i *) sums_above, (__m512i *) sqsum, (__m512i *) sqsum_above, 0xFFFFFFFFFFFFFFFF, sum_accumulator, sqsum_accumulator); srcs+=64; sums+=64; sums_above+=64; if (sqsum){ sqsum+= 64; sqsum_above+=64; } } } static CV_ALWAYS_INLINE void post_aligned_integral(const uchar *srcs, const double *sums, const double *sums_above, const double *sqsum, const double *sqsum_above, __m512i &sum_accumulator, __m512i &sqsum_accumulator, int num_ops_in_line) { // This function handles the last few straggling operations that are not a full chunk of 64 operations // We use the same algorithm, but we calculate a different operation mask using (num_ops % 64). const unsigned int num_operations = (unsigned int) num_ops_in_line & 0x3F; // Quick int modulo 64 if (num_operations > 0) { __mmask64 operation_mask = (1ULL << num_operations) - 1ULL; integral_64_operations_avx512((__m512i *) srcs, (__m512i *) sums, (__m512i *) sums_above, (__m512i *) sqsum, (__m512i *) sqsum_above, operation_mask, sum_accumulator, sqsum_accumulator); } } static CV_ALWAYS_INLINE void integral_64_operations_avx512(const __m512i *srcs, __m512i *sums, const __m512i *sums_above, __m512i *sqsums, const __m512i *sqsums_above, __mmask64 data_mask, __m512i &sum_accumulator, __m512i &sqsum_accumulator) { __m512i src_64byte_chunk = read_64_bytes(srcs, data_mask); while (data_mask) { __m128i src_16bytes = extract_lower_16bytes(src_64byte_chunk); __m512i src_longs_lo = convert_lower_8bytes_to_longs(src_16bytes); __m512i src_longs_hi = convert_lower_8bytes_to_longs(shift_right_8_bytes(src_16bytes)); // Calculate integral for the sum on the 8 lanes at a time integral_8_operations(src_longs_lo, sums_above, data_mask, sums, sum_accumulator); integral_8_operations(src_longs_hi, sums_above+1, data_mask>>8, sums+1, sum_accumulator); if (sqsums) { __m512i squared_source_lo = square_m512(src_longs_lo); __m512i squared_source_hi = square_m512(src_longs_hi); integral_8_operations(squared_source_lo, sqsums_above, data_mask, sqsums, sqsum_accumulator); integral_8_operations(squared_source_hi, sqsums_above+1, data_mask>>8, sqsums+1, sqsum_accumulator); sqsums += 2; sqsums_above+=2; } // Prepare for next iteration of loop // shift source to align next 16 bytes to lane 0, shift the mask, and advance the pointers sums += 2; sums_above += 2; data_mask = data_mask >> 16; src_64byte_chunk = shift_right_16_bytes(src_64byte_chunk); } } static CV_ALWAYS_INLINE void integral_8_operations(const __m512i src_longs, const __m512i *above_values_ptr, __mmask64 data_mask, __m512i *results_ptr, __m512i &accumulator) { // NOTE that the calculate_integral function referenced here must be implemented in the templated // derivatives because the algorithm depends heavily on the number of channels in the image // _mm512_mask_storeu_pd( results_ptr, // Store the result here (__mmask8)data_mask, // Using the data mask to avoid overrunning the line calculate_integral( // Writing the value of the integral derived from: src_longs, // input data _mm512_maskz_loadu_pd((__mmask8)data_mask, above_values_ptr), // and the results from line above accumulator // keeping track of the accumulator ) ); } static CV_ALWAYS_INLINE __m512i read_64_bytes(const __m512i *srcs, const __mmask64 data_mask) { return _mm512_maskz_loadu_epi8(data_mask, srcs); } static CV_ALWAYS_INLINE __m128i extract_lower_16bytes(const __m512i src_64byte_chunk) { return _mm512_extracti64x2_epi64(src_64byte_chunk, 0x0); } static CV_ALWAYS_INLINE __m512i convert_lower_8bytes_to_longs(const __m128i src_16bytes) { return _mm512_cvtepu8_epi64(src_16bytes); } static CV_ALWAYS_INLINE __m512i square_m512(const __m512i src_longs) { return _mm512_mullo_epi64(src_longs, src_longs); } static CV_ALWAYS_INLINE __m128i shift_right_8_bytes(const __m128i src_16bytes) { return _mm_maskz_compress_epi64(2, src_16bytes); } static CV_ALWAYS_INLINE __m512i shift_right_16_bytes(const __m512i src_64byte_chunk) { return _mm512_maskz_compress_epi64(0xFC, src_64byte_chunk); } static CV_ALWAYS_INLINE __m512i m512_hadd(const __m512i a){ return _mm512_add_epi64(_mm512_maskz_compress_epi64(0xAA, a), _mm512_maskz_compress_epi64(0x55, a)); } // The calculate_integral function referenced here must be implemented in the templated derivatives // because the algorithm depends heavily on the number of channels in the image // This is the incomplete definition (just the prototype) here. // static CV_ALWAYS_INLINE __m512d calculate_integral(const __m512i src_longs, const __m512d above_values, __m512i &accumulator); }; //============================================================================================================ // This the only section that needs to change with respect to algorithm based on the number of channels // It is responsible for returning the calculation of 8 lanes worth of the integral and returning in the // accumulated sums in the accumulator parameter (NOTE: accumulator is an input and output parameter) // // The function prototype that needs to be implemented is: // // __m512d calculate_integral(const __m512i src_longs, const __m512d above_values, __m512i &accumulator){ ... } // // Description of parameters: // INPUTS: // src_longs : 8 lanes worth of the source bytes converted to 64 bit integers // above_values: 8 lanes worth of the result values from the line above (See the integral spec) // accumulator : 8 lanes worth of sums from the previous iteration // IMPORTANT NOTE: This parameter is both an INPUT AND OUTPUT parameter to this function // // OUTPUTS: // return value: The actual integral value for all 8 lanes which is defined by the spec as // the sum of all channel values to the left of a given pixel plus the result // written to the line directly above the current line. // accumulator: This is an input and and output. This parameter should be left with the accumulated // sums for the current 8 lanes keeping all entries in the proper lane (do not shuffle it) // // Below here is the channel specific implementation // //======================================== // 1 Channel Integral Implementation //======================================== template<> CV_ALWAYS_INLINE __m512d IntegralCalculator < 1 > ::calculate_integral(const __m512i src_longs, const __m512d above_values, __m512i &accumulator) { // One channel support is implemented differently than 2, 3, or 4 channel // One channel support has more horizontal operations that cannot be made vertical without losing performance // The logical operations needed look like: // Vertical LANES : |7|6|5|4|3|2|1|0| // src_longs : |H|G|F|E|D|C|B|A| // shift_by_1 : + |G|F|E|D|C|B|A| | // shift_by_2 : + |F|E|D|C|B|A| | | // shift_by_3 : + |E|D|C|B|A| | | | // shift_by_4 : + |D|C|B|A| | | | | // shift_by_5 : + |C|B|A| | | | | | // shift_by_6 : + |B|A| | | | | | | // shift_by_7 : + |A| | | | | | | | // carry_over_idxs : + |7|7|7|7|7|7|7|7| (index position of result from previous iteration) // = integral // // If we do this vertically we end up losing performance because of the number of operations. We will instead // do a horizontal add tree to create the vertical sections we need as a tree // Vertical Lanes: | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 | // src_longs: | H | G | F | E | D | C | B | A | // horiz_sum_1: | | | | | G+H | E+F | C+D | A+B | // horiz_sum_2: | | | | | | | EFGH | ABCD | // const __m512i horiz_sum_1 = m512_hadd(src_longs); // indexes for the permutes below (3,2,1,0) = (GH, EF, CD, AB) const __m512i horiz_sum_2 = m512_hadd(horiz_sum_1); // indexes for the permutes below (9, 8) = (EFGH, ABCD) // Then we can use the partial sums by looking at the vertical stacks above and realize that, for example // ABCD appears vertically in lanes 7, 6, 5, 4, and 3 so we will permute the values so that all partial products // appear in the right lanes. and sum them up along with the carry over value from the accumulator. So we setup // the lanes like: // Vertical Lanes: | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 | // s1 : | 0 | G | 0 | E | 0 | C | 0 | A | // s2 : | ABCD | ABCD | ABCD | ABCD | ABCD | AB | AB | 0 | // s3 : | EFGH | EF | EF | 0 | 0 | 0 | 0 | 0 | // +------+------+------+------+------+------+------+------+ // sum : | A..H | A..G | A..F | A..E | A..D | A..C | A..B | A | Integral :-) // const __m512i s1 = _mm512_maskz_mov_epi64(0x55, src_longs); // 0 G 0 E 0 D 0 C 0 A const __m512i s2 = _mm512_permutex2var_epi64(horiz_sum_1, _mm512_set_epi64(8,8,8,8,8,0,0,4), horiz_sum_2); const __m512i s3 = _mm512_permutex2var_epi64(horiz_sum_1, _mm512_set_epi64(9,2,2,4,4,4,4,4), horiz_sum_2); // Now we use the rolling sum from the previous iteration from accumulator and replicate it into carry_over // And sum everything up into the accumulator // const __m512i carry_over = _mm512_permutex2var_epi64(accumulator, _mm512_set_epi64(7,7,7,7,7,7,7,7), accumulator); accumulator = _mm512_add_epi64(_mm512_add_epi64(s2, s3), _mm512_add_epi64(carry_over, s1)); // Convert to double precision and store // __m512d integral_pd = _mm512_add_pd(_mm512_cvtepu64_pd(accumulator), above_values); return integral_pd; } //======================================== // 2 Channel Integral Implementation //======================================== template<> CV_ALWAYS_INLINE __m512d IntegralCalculator < 2 > ::calculate_integral(const __m512i src_longs, const __m512d above_values, __m512i &accumulator) { __m512i carryover_idxs = _mm512_set_epi64(7, 6, 7, 6, 7, 6, 7, 6); // Align data to prepare for the adds: // shifts data left by 3 and 6 qwords(lanes) and gets rolling sum in all lanes // Vertical LANES : 76543210 // src_longs : HGFEDCBA // shift2lanes : + FEDCBA // shift4lanes : + DCBA // shift6lanes : + BA // carry_over_idxs : + 76767676 (index position of result from previous iteration) // = integral __m512i shift2lanes = _mm512_maskz_expand_epi64(0xFC, src_longs); __m512i shift4lanes = _mm512_maskz_expand_epi64(0xF0, src_longs); __m512i shift6lanes = _mm512_maskz_expand_epi64(0xC0, src_longs); __m512i carry_over = _mm512_permutex2var_epi64(accumulator, carryover_idxs, accumulator); // Add all values in tree form for perf ((0+2) + (4+6)) __m512i sum_shift_02 = _mm512_add_epi64(src_longs, shift2lanes); __m512i sum_shift_46 = _mm512_add_epi64(shift4lanes, shift6lanes); __m512i sum_all = _mm512_add_epi64(sum_shift_02, sum_shift_46); accumulator = _mm512_add_epi64(sum_all, carry_over); // Convert to packed double and add to the line above to get the true integral value __m512d accumulator_pd = _mm512_cvtepu64_pd(accumulator); __m512d integral_pd = _mm512_add_pd(accumulator_pd, above_values); return integral_pd; } //======================================== // 3 Channel Integral Implementation //======================================== template<> CV_ALWAYS_INLINE __m512d IntegralCalculator < 3 > ::calculate_integral(const __m512i src_longs, const __m512d above_values, __m512i &accumulator) { __m512i carryover_idxs = _mm512_set_epi64(6, 5, 7, 6, 5, 7, 6, 5); // Align data to prepare for the adds: // shifts data left by 3 and 6 qwords(lanes) and gets rolling sum in all lanes // Vertical LANES: 76543210 // src_longs : HGFEDCBA // shit3lanes : + EDCBA // shift6lanes : + BA // carry_over_idxs : + 65765765 (index position of result from previous iteration) // = integral __m512i shift3lanes = _mm512_maskz_expand_epi64(0xF8, src_longs); __m512i shift6lanes = _mm512_maskz_expand_epi64(0xC0, src_longs); __m512i carry_over = _mm512_permutex2var_epi64(accumulator, carryover_idxs, accumulator); // Do the adds in tree form __m512i sum_shift_03 = _mm512_add_epi64(src_longs, shift3lanes); __m512i sum_rest = _mm512_add_epi64(shift6lanes, carry_over); accumulator = _mm512_add_epi64(sum_shift_03, sum_rest); // Convert to packed double and add to the line above to get the true integral value __m512d accumulator_pd = _mm512_cvtepu64_pd(accumulator); __m512d integral_pd = _mm512_add_pd(accumulator_pd, above_values); return integral_pd; } //======================================== // 4 Channel Integral Implementation //======================================== template<> CV_ALWAYS_INLINE __m512d IntegralCalculator < 4 > ::calculate_integral(const __m512i src_longs, const __m512d above_values, __m512i &accumulator) { __m512i carryover_idxs = _mm512_set_epi64(7, 6, 5, 4, 7, 6, 5, 4); // Align data to prepare for the adds: // shifts data left by 3 and 6 qwords(lanes) and gets rolling sum in all lanes // Vertical LANES: 76543210 // src_longs : HGFEDCBA // shit4lanes : + DCBA // carry_over_idxs : + 76547654 (index position of result from previous iteration) // = integral __m512i shifted4lanes = _mm512_maskz_expand_epi64(0xF0, src_longs); __m512i carry_over = _mm512_permutex2var_epi64(accumulator, carryover_idxs, accumulator); // Add data pixels and carry over from last iteration __m512i sum_shift_04 = _mm512_add_epi64(src_longs, shifted4lanes); accumulator = _mm512_add_epi64(sum_shift_04, carry_over); // Convert to packed double and add to the line above to get the true integral value __m512d accumulator_pd = _mm512_cvtepu64_pd(accumulator); __m512d integral_pd = _mm512_add_pd(accumulator_pd, above_values); return integral_pd; } } // end of anonymous namespace static void calculate_integral_avx512(const uchar *src, size_t _srcstep, double *sum, size_t _sumstep, double *sqsum, size_t _sqsumstep, int width, int height, int cn) { CV_INSTRUMENT_REGION(); switch(cn){ case 1: { IntegralCalculator< 1 > calculator; calculator.calculate_integral_avx512(src, _srcstep, sum, _sumstep, sqsum, _sqsumstep, width, height); break; } case 2: { IntegralCalculator< 2 > calculator; calculator.calculate_integral_avx512(src, _srcstep, sum, _sumstep, sqsum, _sqsumstep, width, height); break; } case 3: { IntegralCalculator< 3 > calculator; calculator.calculate_integral_avx512(src, _srcstep, sum, _sumstep, sqsum, _sqsumstep, width, height); break; } case 4: { IntegralCalculator< 4 > calculator; calculator.calculate_integral_avx512(src, _srcstep, sum, _sumstep, sqsum, _sqsumstep, width, height); } } } CV_CPU_OPTIMIZATION_NAMESPACE_END }} // end namespace cv::hal