diff --git a/modules/calib3d/include/opencv2/calib3d.hpp b/modules/calib3d/include/opencv2/calib3d.hpp index f2915850f9..46a470bc0f 100644 --- a/modules/calib3d/include/opencv2/calib3d.hpp +++ b/modules/calib3d/include/opencv2/calib3d.hpp @@ -1810,8 +1810,7 @@ public: MODE_SGBM = 0, MODE_HH = 1, MODE_SGBM_3WAY = 2, - MODE_HH4 = 3, - MODE_HH4_OLD = 4 + MODE_HH4 = 3 }; CV_WRAP virtual int getPreFilterCap() const = 0; diff --git a/modules/calib3d/src/stereosgbm.cpp b/modules/calib3d/src/stereosgbm.cpp index 8a6bfebc67..5f841775ec 100644 --- a/modules/calib3d/src/stereosgbm.cpp +++ b/modules/calib3d/src/stereosgbm.cpp @@ -1170,6 +1170,402 @@ struct CalcVerticalSums: public ParallelLoopBody int ftzero; }; +struct CalcHorizontalSums: public ParallelLoopBody +{ + CalcHorizontalSums(const Mat& _img1, const Mat& _img2, Mat& _disp1, const StereoSGBMParams& params, + CostType* alignedBuf): img1(_img1), img2(_img2), disp1(_disp1) + { + minD = params.minDisparity; + maxD = minD + params.numDisparities; + P1 = params.P1 > 0 ? params.P1 : 2; + P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1); //TODO: think about P1/S(x,y) Proportion + uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10; + disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1; + height = img1.rows; + width = img1.cols; + minX1 = std::max(maxD, 0); + maxX1 = width + std::min(minD, 0); + INVALID_DISP = minD - 1; + INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE; + D = maxD - minD; + width1 = maxX1 - minX1; + costBufSize = width1*D; + CSBufSize = costBufSize*height; + D2 = D + 16; + LrSize = 2 * D2; //TODO: Check: do we need this value or not? + Cbuf = alignedBuf; + Sbuf = Cbuf + CSBufSize; + } + + void operator()( const Range& range ) const + { + int y1 = range.start, y2 = range.end; + static const CostType MAX_COST = SHRT_MAX; + static const int ALIGN = 16; + size_t auxBufsSize = LrSize + width*(sizeof(CostType) + sizeof(DispType)) + 32; + + Mat auxBuff; + auxBuff.create(1, (int)auxBufsSize, CV_8U); + CostType *Lr = (CostType*)alignPtr(auxBuff.ptr(), ALIGN) + 8; + CostType* disp2cost = Lr + LrSize; + DispType* disp2ptr = (DispType*)(disp2cost + width); + + CostType minLr; + + for( int y = y1; y != y2; y++) + { + int x, d; + DispType* disp1ptr = disp1.ptr(y); + CostType* C = Cbuf + y*costBufSize; + CostType* S = Sbuf + y*costBufSize; + + for( x = 0; x < width; x++ ) + { + disp1ptr[x] = disp2ptr[x] = (DispType)INVALID_DISP_SCALED; + disp2cost[x] = MAX_COST; + } + + // clear buffers + memset( Lr - 8, 0, LrSize*sizeof(CostType) ); + minLr = 0; + /* + [formula 13 in the paper] + compute L_r(p, d) = C(p, d) + + min(L_r(p-r, d), + L_r(p-r, d-1) + P1, + L_r(p-r, d+1) + P1, + min_k L_r(p-r, k) + P2) - min_k L_r(p-r, k) + where p = (x,y), r is one of the directions. + we process all the directions at once: + 0: r=(-dx, 0) + 1: r=(-1, -dy) + 2: r=(0, -dy) + 3: r=(1, -dy) + 4: r=(-2, -dy) + 5: r=(-1, -dy*2) + 6: r=(1, -dy*2) + 7: r=(2, -dy) + */ + for( x = 0; x != width1; x++) + { + int delta = minLr + P2; + + CostType* Lr_ppr = Lr + ((x&1)? 0 : D2); + + Lr_ppr[-1] = Lr_ppr[D] = MAX_COST; //TODO: Well, probably, it's better do this once before. + + CostType* Lr_p = Lr + ((x&1)? D2 :0); + const CostType* Cp = C + x*D; + CostType* Sp = S + x*D; + +// #if CV_SIMD128 +// if( useSIMD ) +// { +// v_int16x8 _P1 = v_setall_s16((short)P1); +// +// v_int16x8 _delta0 = v_setall_s16((short)delta0); +// v_int16x8 _delta1 = v_setall_s16((short)delta1); +// v_int16x8 _delta2 = v_setall_s16((short)delta2); +// v_int16x8 _delta3 = v_setall_s16((short)delta3); +// v_int16x8 _minL0 = v_setall_s16((short)MAX_COST); +// +// for( d = 0; d < D; d += 8 ) +// { +// v_int16x8 Cpd = v_load(Cp + d); +// v_int16x8 L0, L1, L2, L3; +// +// L0 = v_load(Lr_ppr + d); +// L1 = v_load(Lr_p1 + d); +// L2 = v_load(Lr_p2 + d); +// L3 = v_load(Lr_p3 + d); +// +// L0 = v_min(L0, (v_load(Lr_ppr + d - 1) + _P1)); +// L0 = v_min(L0, (v_load(Lr_ppr + d + 1) + _P1)); +// +// L1 = v_min(L1, (v_load(Lr_p1 + d - 1) + _P1)); +// L1 = v_min(L1, (v_load(Lr_p1 + d + 1) + _P1)); +// +// L2 = v_min(L2, (v_load(Lr_p2 + d - 1) + _P1)); +// L2 = v_min(L2, (v_load(Lr_p2 + d + 1) + _P1)); +// +// L3 = v_min(L3, (v_load(Lr_p3 + d - 1) + _P1)); +// L3 = v_min(L3, (v_load(Lr_p3 + d + 1) + _P1)); +// +// L0 = v_min(L0, _delta0); +// L0 = ((L0 - _delta0) + Cpd); +// +// L1 = v_min(L1, _delta1); +// L1 = ((L1 - _delta1) + Cpd); +// +// L2 = v_min(L2, _delta2); +// L2 = ((L2 - _delta2) + Cpd); +// +// L3 = v_min(L3, _delta3); +// L3 = ((L3 - _delta3) + Cpd); +// +// v_store(Lr_p + d, L0); +// v_store(Lr_p + d + D2, L1); +// v_store(Lr_p + d + D2*2, L2); +// v_store(Lr_p + d + D2*3, L3); +// +// // Get minimum from in L0-L3 +// v_int16x8 t02L, t02H, t13L, t13H, t0123L, t0123H; +// v_zip(L0, L2, t02L, t02H); // L0[0] L2[0] L0[1] L2[1]... +// v_zip(L1, L3, t13L, t13H); // L1[0] L3[0] L1[1] L3[1]... +// v_int16x8 t02 = v_min(t02L, t02H); // L0[i] L2[i] L0[i] L2[i]... +// v_int16x8 t13 = v_min(t13L, t13H); // L1[i] L3[i] L1[i] L3[i]... +// v_zip(t02, t13, t0123L, t0123H); // L0[i] L1[i] L2[i] L3[i]... +// v_int16x8 t0 = v_min(t0123L, t0123H); +// _minL0 = v_min(_minL0, t0); +// +// v_int16x8 Sval = v_load(Sp + d); +// +// L0 = L0 + L1; +// L2 = L2 + L3; +// Sval = Sval + L0; +// Sval = Sval + L2; +// +// v_store(Sp + d, Sval); +// } +// +// v_int32x4 minL, minH; +// v_expand(_minL0, minL, minH); +// v_pack_store(&minLr[x], v_min(minL, minH)); +// } +// else +// #endif + { + int minL = MAX_COST; + + for( d = 0; d < D; d++ ) + { + int Cpd = Cp[d], L; //4e: Remember, that every Cp is increased on P2 in line number 369. That's why next 4 lines are paper-like actually + + L = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta; + + Lr_p[d] = (CostType)L; + minL = std::min(minL, L); + + Sp[d] = saturate_cast(Sp[d] + L); + } + minLr = (CostType)minL; + } + } + + memset( Lr - 8, 0, LrSize*sizeof(CostType) ); + minLr = 0; + + for( x = width1-1; x != -1; x--) + { + int delta = minLr + P2; + + CostType* Lr_ppr = Lr + ((x&1)? 0 :D2); + + Lr_ppr[-1] = Lr_ppr[D] = MAX_COST; + + CostType* Lr_p = Lr + ((x&1)? D2 :0); + const CostType* Cp = C + x*D; + CostType* Sp = S + x*D; + int minS = MAX_COST, bestDisp = -1; + +// #if CV_SIMD128 +// if( useSIMD ) +// { +// v_int16x8 _P1 = v_setall_s16((short)P1); +// +// v_int16x8 _delta0 = v_setall_s16((short)delta0); +// v_int16x8 _delta1 = v_setall_s16((short)delta1); +// v_int16x8 _delta2 = v_setall_s16((short)delta2); +// v_int16x8 _delta3 = v_setall_s16((short)delta3); +// v_int16x8 _minL0 = v_setall_s16((short)MAX_COST); +// +// for( d = 0; d < D; d += 8 ) +// { +// v_int16x8 Cpd = v_load(Cp + d); +// v_int16x8 L0, L1, L2, L3; +// +// L0 = v_load(Lr_ppr + d); +// L1 = v_load(Lr_p1 + d); +// L2 = v_load(Lr_p2 + d); +// L3 = v_load(Lr_p3 + d); +// +// L0 = v_min(L0, (v_load(Lr_ppr + d - 1) + _P1)); +// L0 = v_min(L0, (v_load(Lr_ppr + d + 1) + _P1)); +// +// L1 = v_min(L1, (v_load(Lr_p1 + d - 1) + _P1)); +// L1 = v_min(L1, (v_load(Lr_p1 + d + 1) + _P1)); +// +// L2 = v_min(L2, (v_load(Lr_p2 + d - 1) + _P1)); +// L2 = v_min(L2, (v_load(Lr_p2 + d + 1) + _P1)); +// +// L3 = v_min(L3, (v_load(Lr_p3 + d - 1) + _P1)); +// L3 = v_min(L3, (v_load(Lr_p3 + d + 1) + _P1)); +// +// L0 = v_min(L0, _delta0); +// L0 = ((L0 - _delta0) + Cpd); +// +// L1 = v_min(L1, _delta1); +// L1 = ((L1 - _delta1) + Cpd); +// +// L2 = v_min(L2, _delta2); +// L2 = ((L2 - _delta2) + Cpd); +// +// L3 = v_min(L3, _delta3); +// L3 = ((L3 - _delta3) + Cpd); +// +// v_store(Lr_p + d, L0); +// v_store(Lr_p + d + D2, L1); +// v_store(Lr_p + d + D2*2, L2); +// v_store(Lr_p + d + D2*3, L3); +// +// // Get minimum from in L0-L3 +// v_int16x8 t02L, t02H, t13L, t13H, t0123L, t0123H; +// v_zip(L0, L2, t02L, t02H); // L0[0] L2[0] L0[1] L2[1]... +// v_zip(L1, L3, t13L, t13H); // L1[0] L3[0] L1[1] L3[1]... +// v_int16x8 t02 = v_min(t02L, t02H); // L0[i] L2[i] L0[i] L2[i]... +// v_int16x8 t13 = v_min(t13L, t13H); // L1[i] L3[i] L1[i] L3[i]... +// v_zip(t02, t13, t0123L, t0123H); // L0[i] L1[i] L2[i] L3[i]... +// v_int16x8 t0 = v_min(t0123L, t0123H); +// _minL0 = v_min(_minL0, t0); +// +// v_int16x8 Sval = v_load(Sp + d); +// +// L0 = L0 + L1; +// L2 = L2 + L3; +// Sval = Sval + L0; +// Sval = Sval + L2; +// +// v_store(Sp + d, Sval); +// } +// +// v_int32x4 minL, minH; +// v_expand(_minL0, minL, minH); +// v_pack_store(&minLr[x], v_min(minL, minH)); +// } +// else +// #endif +//TODO:Next piece of code is came from postprocessing. Be very careful with joining them!!! +// #if CV_SIMD128 +// if( useSIMD ) +// { +// v_int16x8 _minS = v_setall_s16(MAX_COST), _bestDisp = v_setall_s16(-1); +// v_int16x8 _d8 = v_int16x8(0, 1, 2, 3, 4, 5, 6, 7), _8 = v_setall_s16(8); +// +// for( d = 0; d < D; d+= 8 ) +// { +// v_int16x8 L0 = v_load(Sp + d); +// v_int16x8 mask = L0 < _minS; +// _minS = v_min( L0, _minS ); +// _bestDisp = _bestDisp ^ ((_bestDisp ^ _d8) & mask); +// _d8 = _d8 + _8; +// } +// v_int32x4 _d0, _d1; +// v_expand(_minS, _d0, _d1); +// minS = (int)std::min(v_reduce_min(_d0), v_reduce_min(_d1)); +// v_int16x8 v_mask = v_setall_s16((short)minS) == _minS; +// +// _bestDisp = (_bestDisp & v_mask) | (v_setall_s16(SHRT_MAX) & ~v_mask); +// v_expand(_bestDisp, _d0, _d1); +// bestDisp = (int)std::min(v_reduce_min(_d0), v_reduce_min(_d1)); +// } +// else +// #endif + { + int minL = MAX_COST; + + for( d = 0; d < D; d++ ) + { + int Cpd = Cp[d], L; //4e: Remember, that every Cp is increased on P2 in line number 369. That's why next 4 lines are paper-like actually + + L = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta; + + Lr_p[d] = (CostType)L; + minL = std::min(minL, L); + + Sp[d] = saturate_cast(Sp[d] + L); + if( Sp[d] < minS ) + { + minS = Sp[d]; + bestDisp = d; + } + } + minLr = (CostType)minL; + } + //Some postprocessing procedures and saving + for( d = 0; d < D; d++ ) + { + if( Sp[d]*(100 - uniquenessRatio) < minS*100 && std::abs(bestDisp - d) > 1 ) + break; + } + if( d < D ) + continue; + d = bestDisp; + int _x2 = x + minX1 - d - minD; + if( disp2cost[_x2] > minS ) + { + disp2cost[_x2] = (CostType)minS; + disp2ptr[_x2] = (DispType)(d + minD); + } + + if( 0 < d && d < D-1 ) + { + // do subpixel quadratic interpolation: + // fit parabola into (x1=d-1, y1=Sp[d-1]), (x2=d, y2=Sp[d]), (x3=d+1, y3=Sp[d+1]) + // then find minimum of the parabola. + int denom2 = std::max(Sp[d-1] + Sp[d+1] - 2*Sp[d], 1); + d = d*DISP_SCALE + ((Sp[d-1] - Sp[d+1])*DISP_SCALE + denom2)/(denom2*2); + } + else + d *= DISP_SCALE; + disp1ptr[x + minX1] = (DispType)(d + minD*DISP_SCALE); + } + //Left-right check sanity procedure + for( x = minX1; x < maxX1; x++ ) + { + // we round the computed disparity both towards -inf and +inf and check + // if either of the corresponding disparities in disp2 is consistent. + // This is to give the computed disparity a chance to look valid if it is. + int d1 = disp1ptr[x]; + if( d1 == INVALID_DISP_SCALED ) + continue; + int _d = d1 >> DISP_SHIFT; + int d_ = (d1 + DISP_SCALE-1) >> DISP_SHIFT; + int _x = x - _d, x_ = x - d_; + if( 0 <= _x && _x < width && disp2ptr[_x] >= minD && std::abs(disp2ptr[_x] - _d) > disp12MaxDiff && //4e: To dismiss disparity, we should assure, that there is no any + 0 <= x_ && x_ < width && disp2ptr[x_] >= minD && std::abs(disp2ptr[x_] - d_) > disp12MaxDiff ) //4e: chance to understand this as correct. + disp1ptr[x] = (DispType)INVALID_DISP_SCALED; + } + } + } + + static const int NLR = 2; + static const int LrBorder = NLR - 1; + static const int DISP_SHIFT = StereoMatcher::DISP_SHIFT; + static const int DISP_SCALE = (1 << DISP_SHIFT); + const Mat& img1; + const Mat& img2; + Mat& disp1; + CostType* Cbuf; + CostType* Sbuf; + int minD; + int maxD; + int D; + int D2; + int width; + int width1; + int height; + int P1; + int P2; + int minX1; + int maxX1; + size_t costBufSize; + size_t CSBufSize; + size_t LrSize; + int INVALID_DISP; + int INVALID_DISP_SCALED; + int uniquenessRatio; + int disp12MaxDiff; +}; /* This is new experimential version of disparity calculation, which should be parralled after TODO: Don't forget to rewrire this commentaries after @@ -1218,20 +1614,17 @@ static void computeDisparitySGBMParallel( const Mat& img1, const Mat& img2, const int ALIGN = 16; const int DISP_SHIFT = StereoMatcher::DISP_SHIFT; const int DISP_SCALE = (1 << DISP_SHIFT); - const CostType MAX_COST = SHRT_MAX; - int minD = params.minDisparity, maxD = minD + params.numDisparities; Size SADWindowSize; //4e: SAD means Sum of Absolute Differences SADWindowSize.width = SADWindowSize.height = params.SADWindowSize > 0 ? params.SADWindowSize : 5; //4e: and this is always square int ftzero = std::max(params.preFilterCap, 15) | 1; //4e:ftzero clips x-derivatives. I think, this story with arrays is about non-realized SIMD method - int uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10; - int disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1; int P1 = params.P1 > 0 ? params.P1 : 2, P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1); //TODO: think about P1/S(x,y) Proportion int k, width = disp1.cols, height = disp1.rows; int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0); int D = maxD - minD, width1 = maxX1 - minX1; - int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE; int SH2 = SADWindowSize.height/2; + int INVALID_DISP = minD - 1; + int INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE; const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2; //4e: array is such big due to derivative could be +-8*256 in worst cases PixType clipTab[TAB_SIZE]; @@ -1265,8 +1658,7 @@ static void computeDisparitySGBMParallel( const Mat& img1, const Mat& img2, int hsumBufNRows = SH2*2 + 2; size_t totalBufSize = (LrSize + minLrSize)*NLR*sizeof(CostType) + // minLr[] and Lr[] costBufSize*hsumBufNRows*sizeof(CostType) + // hsumBuf //4e: TODO: Why we should increase sum window height one more time? - CSBufSize*2*sizeof(CostType) + // C, S //4e: C is Block sum of costs, S is multidirectional dynamic sum with same size - width*(sizeof(CostType) + sizeof(DispType)) + 1024; // disp2cost + disp2 + CSBufSize*2*sizeof(CostType) + 1024; // C, S //4e: C is Block sum of costs, S is multidirectional dynamic sum with same size if( buffer.empty() || !buffer.isContinuous() || buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize ) @@ -1274,1098 +1666,18 @@ static void computeDisparitySGBMParallel( const Mat& img1, const Mat& img2, // summary cost over different (nDirs) directions CostType* Cbuf = (CostType*)alignPtr(buffer.ptr(), ALIGN); - CostType* Sbuf = Cbuf + CSBufSize; - CostType* hsumBuf = Sbuf + CSBufSize; - - CostType* disp2cost = hsumBuf + costBufSize*hsumBufNRows + (LrSize + minLrSize)*NLR; //4e: It is containers for backwards disparity, made by S[d] too, but with other method - DispType* disp2ptr = (DispType*)(disp2cost + width); // add P2 to every C(x,y). it saves a few operations in the inner loops for(k = 0; k < (int)CSBufSize; k++ ) Cbuf[k] = (CostType)P2; parallel_for_(Range(0,width1),CalcVerticalSums(img1, img2, params, Cbuf, clipTab)); + parallel_for_(Range(0,height),CalcHorizontalSums(img1, img2, disp1, params, Cbuf)); -// for( int pass = 1; pass <= 2; pass++ ) //pass=1 or left-to-right pass - { - - CostType *Lr, *minLr; - - { //4e: Yes, and this is done at the end of next cycle, not here. - // shift Lr[k] and minLr[k] pointers, because we allocated them with the borders, - // and will occasionally use negative indices with the arrays - // we need to shift Lr[k] pointers by 1, to give the space for d=-1. - // however, then the alignment will be imperfect, i.e. bad for SSE, - // thus we shift the pointers by 8 (8*sizeof(short) == 16 - ideal alignment) - Lr = hsumBuf + costBufSize*hsumBufNRows + D2*LrBorder + 8; - memset( Lr - LrBorder*D2 - 8, 0, LrSize*sizeof(CostType) ); - minLr = hsumBuf + costBufSize*hsumBufNRows + LrSize*NLR + LrBorder; - memset( minLr - LrBorder, 0, minLrSize*sizeof(CostType) ); - } - - for( int y = 0; y != height; y++) - { - int x, d; - DispType* disp1ptr = disp1.ptr(y); - CostType* C = Cbuf + y*costBufSize; - CostType* S = Sbuf + y*costBufSize; - - for( x = 0; x < width; x++ ) - { - disp1ptr[x] = disp2ptr[x] = (DispType)INVALID_DISP_SCALED; - disp2cost[x] = MAX_COST; - } - - // clear the left and the right borders - memset( Lr - D2*LrBorder - 8, 0, D2*LrBorder*sizeof(CostType) ); //4e: To understand this "8" shifts and how they could work it's simpler to imagine pixel dislocation in memory - memset( Lr + width1*D2 - 8, 0, D2*LrBorder*sizeof(CostType) ); //4e: ...00000000|D2-16 of real costs value(and some of them are zeroes too)|00000000... - memset( minLr - LrBorder, 0, LrBorder*sizeof(CostType) ); - memset( minLr + width1, 0, LrBorder*sizeof(CostType) ); - - /* - [formula 13 in the paper] - compute L_r(p, d) = C(p, d) + - min(L_r(p-r, d), - L_r(p-r, d-1) + P1, - L_r(p-r, d+1) + P1, - min_k L_r(p-r, k) + P2) - min_k L_r(p-r, k) - where p = (x,y), r is one of the directions. - we process all the directions at once: - 0: r=(-dx, 0) - 1: r=(-1, -dy) - 2: r=(0, -dy) - 3: r=(1, -dy) - 4: r=(-2, -dy) - 5: r=(-1, -dy*2) - 6: r=(1, -dy*2) - 7: r=(2, -dy) - */ - for( x = 0; x != width1; x++) - { - int xd = x*D2; - - int delta = minLr[x - 1] + P2; - - CostType* Lr_ppr = Lr + xd - D2; - - Lr_ppr[-1] = Lr_ppr[D] = MAX_COST; - - CostType* Lr_p = Lr + xd; - const CostType* Cp = C + x*D; - CostType* Sp = S + x*D; - -// #if CV_SIMD128 -// if( useSIMD ) -// { -// v_int16x8 _P1 = v_setall_s16((short)P1); -// -// v_int16x8 _delta0 = v_setall_s16((short)delta0); -// v_int16x8 _delta1 = v_setall_s16((short)delta1); -// v_int16x8 _delta2 = v_setall_s16((short)delta2); -// v_int16x8 _delta3 = v_setall_s16((short)delta3); -// v_int16x8 _minL0 = v_setall_s16((short)MAX_COST); -// -// for( d = 0; d < D; d += 8 ) -// { -// v_int16x8 Cpd = v_load(Cp + d); -// v_int16x8 L0, L1, L2, L3; -// -// L0 = v_load(Lr_ppr + d); -// L1 = v_load(Lr_p1 + d); -// L2 = v_load(Lr_p2 + d); -// L3 = v_load(Lr_p3 + d); -// -// L0 = v_min(L0, (v_load(Lr_ppr + d - 1) + _P1)); -// L0 = v_min(L0, (v_load(Lr_ppr + d + 1) + _P1)); -// -// L1 = v_min(L1, (v_load(Lr_p1 + d - 1) + _P1)); -// L1 = v_min(L1, (v_load(Lr_p1 + d + 1) + _P1)); -// -// L2 = v_min(L2, (v_load(Lr_p2 + d - 1) + _P1)); -// L2 = v_min(L2, (v_load(Lr_p2 + d + 1) + _P1)); -// -// L3 = v_min(L3, (v_load(Lr_p3 + d - 1) + _P1)); -// L3 = v_min(L3, (v_load(Lr_p3 + d + 1) + _P1)); -// -// L0 = v_min(L0, _delta0); -// L0 = ((L0 - _delta0) + Cpd); -// -// L1 = v_min(L1, _delta1); -// L1 = ((L1 - _delta1) + Cpd); -// -// L2 = v_min(L2, _delta2); -// L2 = ((L2 - _delta2) + Cpd); -// -// L3 = v_min(L3, _delta3); -// L3 = ((L3 - _delta3) + Cpd); -// -// v_store(Lr_p + d, L0); -// v_store(Lr_p + d + D2, L1); -// v_store(Lr_p + d + D2*2, L2); -// v_store(Lr_p + d + D2*3, L3); -// -// // Get minimum from in L0-L3 -// v_int16x8 t02L, t02H, t13L, t13H, t0123L, t0123H; -// v_zip(L0, L2, t02L, t02H); // L0[0] L2[0] L0[1] L2[1]... -// v_zip(L1, L3, t13L, t13H); // L1[0] L3[0] L1[1] L3[1]... -// v_int16x8 t02 = v_min(t02L, t02H); // L0[i] L2[i] L0[i] L2[i]... -// v_int16x8 t13 = v_min(t13L, t13H); // L1[i] L3[i] L1[i] L3[i]... -// v_zip(t02, t13, t0123L, t0123H); // L0[i] L1[i] L2[i] L3[i]... -// v_int16x8 t0 = v_min(t0123L, t0123H); -// _minL0 = v_min(_minL0, t0); -// -// v_int16x8 Sval = v_load(Sp + d); -// -// L0 = L0 + L1; -// L2 = L2 + L3; -// Sval = Sval + L0; -// Sval = Sval + L2; -// -// v_store(Sp + d, Sval); -// } -// -// v_int32x4 minL, minH; -// v_expand(_minL0, minL, minH); -// v_pack_store(&minLr[x], v_min(minL, minH)); -// } -// else -// #endif - { - int minL = MAX_COST; - - for( d = 0; d < D; d++ ) - { - int Cpd = Cp[d], L; //4e: Remember, that every Cp is increased on P2 in line number 369. That's why next 4 lines are paper-like actually - - L = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta; - - Lr_p[d] = (CostType)L; - minL = std::min(minL, L); - - Sp[d] = saturate_cast(Sp[d] + L); - } - minLr[x] = (CostType)minL; - } - } - - for( x = width1-1; x != -1; x--) - { - int xd = x*D2; - - int delta = minLr[x + 1] + P2; - - CostType* Lr_ppr = Lr + xd + D2; - - Lr_ppr[-1] = Lr_ppr[D] = MAX_COST; - - CostType* Lr_p = Lr + xd; - const CostType* Cp = C + x*D; - CostType* Sp = S + x*D; - int minS = MAX_COST, bestDisp = -1; - -// #if CV_SIMD128 -// if( useSIMD ) -// { -// v_int16x8 _P1 = v_setall_s16((short)P1); -// -// v_int16x8 _delta0 = v_setall_s16((short)delta0); -// v_int16x8 _delta1 = v_setall_s16((short)delta1); -// v_int16x8 _delta2 = v_setall_s16((short)delta2); -// v_int16x8 _delta3 = v_setall_s16((short)delta3); -// v_int16x8 _minL0 = v_setall_s16((short)MAX_COST); -// -// for( d = 0; d < D; d += 8 ) -// { -// v_int16x8 Cpd = v_load(Cp + d); -// v_int16x8 L0, L1, L2, L3; -// -// L0 = v_load(Lr_ppr + d); -// L1 = v_load(Lr_p1 + d); -// L2 = v_load(Lr_p2 + d); -// L3 = v_load(Lr_p3 + d); -// -// L0 = v_min(L0, (v_load(Lr_ppr + d - 1) + _P1)); -// L0 = v_min(L0, (v_load(Lr_ppr + d + 1) + _P1)); -// -// L1 = v_min(L1, (v_load(Lr_p1 + d - 1) + _P1)); -// L1 = v_min(L1, (v_load(Lr_p1 + d + 1) + _P1)); -// -// L2 = v_min(L2, (v_load(Lr_p2 + d - 1) + _P1)); -// L2 = v_min(L2, (v_load(Lr_p2 + d + 1) + _P1)); -// -// L3 = v_min(L3, (v_load(Lr_p3 + d - 1) + _P1)); -// L3 = v_min(L3, (v_load(Lr_p3 + d + 1) + _P1)); -// -// L0 = v_min(L0, _delta0); -// L0 = ((L0 - _delta0) + Cpd); -// -// L1 = v_min(L1, _delta1); -// L1 = ((L1 - _delta1) + Cpd); -// -// L2 = v_min(L2, _delta2); -// L2 = ((L2 - _delta2) + Cpd); -// -// L3 = v_min(L3, _delta3); -// L3 = ((L3 - _delta3) + Cpd); -// -// v_store(Lr_p + d, L0); -// v_store(Lr_p + d + D2, L1); -// v_store(Lr_p + d + D2*2, L2); -// v_store(Lr_p + d + D2*3, L3); -// -// // Get minimum from in L0-L3 -// v_int16x8 t02L, t02H, t13L, t13H, t0123L, t0123H; -// v_zip(L0, L2, t02L, t02H); // L0[0] L2[0] L0[1] L2[1]... -// v_zip(L1, L3, t13L, t13H); // L1[0] L3[0] L1[1] L3[1]... -// v_int16x8 t02 = v_min(t02L, t02H); // L0[i] L2[i] L0[i] L2[i]... -// v_int16x8 t13 = v_min(t13L, t13H); // L1[i] L3[i] L1[i] L3[i]... -// v_zip(t02, t13, t0123L, t0123H); // L0[i] L1[i] L2[i] L3[i]... -// v_int16x8 t0 = v_min(t0123L, t0123H); -// _minL0 = v_min(_minL0, t0); -// -// v_int16x8 Sval = v_load(Sp + d); -// -// L0 = L0 + L1; -// L2 = L2 + L3; -// Sval = Sval + L0; -// Sval = Sval + L2; -// -// v_store(Sp + d, Sval); -// } -// -// v_int32x4 minL, minH; -// v_expand(_minL0, minL, minH); -// v_pack_store(&minLr[x], v_min(minL, minH)); -// } -// else -// #endif -//TODO:Next piece of code is came from postprocessing. Be very careful with joining them!!! -// #if CV_SIMD128 -// if( useSIMD ) -// { -// v_int16x8 _minS = v_setall_s16(MAX_COST), _bestDisp = v_setall_s16(-1); -// v_int16x8 _d8 = v_int16x8(0, 1, 2, 3, 4, 5, 6, 7), _8 = v_setall_s16(8); -// -// for( d = 0; d < D; d+= 8 ) -// { -// v_int16x8 L0 = v_load(Sp + d); -// v_int16x8 mask = L0 < _minS; -// _minS = v_min( L0, _minS ); -// _bestDisp = _bestDisp ^ ((_bestDisp ^ _d8) & mask); -// _d8 = _d8 + _8; -// } -// v_int32x4 _d0, _d1; -// v_expand(_minS, _d0, _d1); -// minS = (int)std::min(v_reduce_min(_d0), v_reduce_min(_d1)); -// v_int16x8 v_mask = v_setall_s16((short)minS) == _minS; -// -// _bestDisp = (_bestDisp & v_mask) | (v_setall_s16(SHRT_MAX) & ~v_mask); -// v_expand(_bestDisp, _d0, _d1); -// bestDisp = (int)std::min(v_reduce_min(_d0), v_reduce_min(_d1)); -// } -// else -// #endif - { - int minL = MAX_COST; - - for( d = 0; d < D; d++ ) - { - int Cpd = Cp[d], L; //4e: Remember, that every Cp is increased on P2 in line number 369. That's why next 4 lines are paper-like actually - - L = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta; - - Lr_p[d] = (CostType)L; - minL = std::min(minL, L); - - Sp[d] = saturate_cast(Sp[d] + L); - if( Sp[d] < minS ) - { - minS = Sp[d]; - bestDisp = d; - } - } - minLr[x] = (CostType)minL; - } - //Some postprocessing procedures and saving - for( d = 0; d < D; d++ ) - { - if( Sp[d]*(100 - uniquenessRatio) < minS*100 && std::abs(bestDisp - d) > 1 ) - break; - } - if( d < D ) - continue; - d = bestDisp; - int _x2 = x + minX1 - d - minD; - if( disp2cost[_x2] > minS ) - { - disp2cost[_x2] = (CostType)minS; - disp2ptr[_x2] = (DispType)(d + minD); - } - - if( 0 < d && d < D-1 ) - { - // do subpixel quadratic interpolation: - // fit parabola into (x1=d-1, y1=Sp[d-1]), (x2=d, y2=Sp[d]), (x3=d+1, y3=Sp[d+1]) - // then find minimum of the parabola. - int denom2 = std::max(Sp[d-1] + Sp[d+1] - 2*Sp[d], 1); - d = d*DISP_SCALE + ((Sp[d-1] - Sp[d+1])*DISP_SCALE + denom2)/(denom2*2); - } - else - d *= DISP_SCALE; - disp1ptr[x + minX1] = (DispType)(d + minD*DISP_SCALE); - } - //Left-right check sanity procedure - for( x = minX1; x < maxX1; x++ ) - { - // we round the computed disparity both towards -inf and +inf and check - // if either of the corresponding disparities in disp2 is consistent. - // This is to give the computed disparity a chance to look valid if it is. - int d1 = disp1ptr[x]; - if( d1 == INVALID_DISP_SCALED ) - continue; - int _d = d1 >> DISP_SHIFT; - int d_ = (d1 + DISP_SCALE-1) >> DISP_SHIFT; - int _x = x - _d, x_ = x - d_; - if( 0 <= _x && _x < width && disp2ptr[_x] >= minD && std::abs(disp2ptr[_x] - _d) > disp12MaxDiff && //4e: To dismiss disparity, we should assure, that there is no any - 0 <= x_ && x_ < width && disp2ptr[x_] >= minD && std::abs(disp2ptr[x_] - d_) > disp12MaxDiff ) //4e: chance to understand this as correct. - disp1ptr[x] = (DispType)INVALID_DISP_SCALED; - } - } - } } ////////////////////////////////////////////////////////////////////////////////////////////////////// -/* - This is new experimential version of disparity calculation, which should be parralled after -TODO: Don't forget to rewrire this commentaries after - - computes disparity for "roi" in img1 w.r.t. img2 and write it to disp1buf. - that is, disp1buf(x, y)=d means that img1(x+roi.x, y+roi.y) ~ img2(x+roi.x-d, y+roi.y). - minD <= d < maxD. - disp2full is the reverse disparity map, that is: - disp2full(x+roi.x,y+roi.y)=d means that img2(x+roi.x, y+roi.y) ~ img1(x+roi.x+d, y+roi.y) - - note that disp1buf will have the same size as the roi and - disp2full will have the same size as img1 (or img2). - On exit disp2buf is not the final disparity, it is an intermediate result that becomes - final after all the tiles are processed. - - the disparity in disp1buf is written with sub-pixel accuracy - (4 fractional bits, see StereoSGBM::DISP_SCALE), - using quadratic interpolation, while the disparity in disp2buf - is written as is, without interpolation. - - disp2cost also has the same size as img1 (or img2). - It contains the minimum current cost, used to find the best disparity, corresponding to the minimal cost. - */ -static void computeDisparitySGBMParallelOld( const Mat& img1, const Mat& img2, - Mat& disp1, const StereoSGBMParams& params, - Mat& buffer ) -{ -//#if CV_SIMD128 -// // maxDisparity is supposed to multiple of 16, so we can forget doing else -// static const uchar LSBTab[] = -// { -// 0, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, -// 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, -// 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, -// 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, -// 7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, -// 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, -// 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, -// 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 -// }; -// static const v_uint16x8 v_LSB = v_uint16x8(0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80); -// -// bool useSIMD = hasSIMD128(); -//#endif - - const int ALIGN = 16; - const int DISP_SHIFT = StereoMatcher::DISP_SHIFT; - const int DISP_SCALE = (1 << DISP_SHIFT); - const CostType MAX_COST = SHRT_MAX; - - int minD = params.minDisparity, maxD = minD + params.numDisparities; - Size SADWindowSize; //4e: SAD means Sum of Absolute Differences - SADWindowSize.width = SADWindowSize.height = params.SADWindowSize > 0 ? params.SADWindowSize : 5; //4e: and this is always square - int ftzero = std::max(params.preFilterCap, 15) | 1; //4e:ftzero clips x-derivatives. I think, this story with arrays is about non-realized SIMD method - int uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10; - int disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1; - int P1 = params.P1 > 0 ? params.P1 : 2, P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1); //TODO: think about P1/S(x,y) Proportion - int k, width = disp1.cols, height = disp1.rows; - int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0); - int D = maxD - minD, width1 = maxX1 - minX1; - int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE; - int SW2 = SADWindowSize.width/2, SH2 = SADWindowSize.height/2; - const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2; //4e: array is such big due to derivative could be +-8*256 in worst cases - PixType clipTab[TAB_SIZE]; - - for( k = 0; k < TAB_SIZE; k++ ) //4e: If ftzero would = 4, array containment will be = -4 -4 -4 ... -4 -3 -2 -1 0 1 2 3 4 ... 4 4 4 - clipTab[k] = (PixType)(std::min(std::max(k - TAB_OFS, -ftzero), ftzero) + ftzero); - - if( minX1 >= maxX1 ) - { - disp1 = Scalar::all(INVALID_DISP_SCALED); - return; - } - - CV_Assert( D % 16 == 0 ); //TODO: Are you sure? By the way, why not 8? - - // NR - the number of directions. the loop on x below that computes Lr assumes that NR == 8. - // if you change NR, please, modify the loop as well. - int D2 = D+16; //4e: Somewhere in code we need d+1, so D+1. One of simplest solutuons is increasing D-dimension on 1. But 1 is 16, when storage should be aligned. - - // the number of L_r(.,.) and min_k L_r(.,.) lines in the buffer: - // for 8-way dynamic programming we need the current row and - // the previous row, i.e. 2 rows in total - const int NLR = 2; //4e: We assume, that we need one or more previous steps in our linear dynamic(one right here). - const int LrBorder = NLR - 1; //4e: for simplification of calculations we need border for taking previous dynamic solutions. - - // for each possible stereo match (img1(x,y) <=> img2(x-d,y)) - // we keep pixel difference cost (C) and the summary cost over NR directions (S). - // we also keep all the partial costs for the previous line L_r(x,d) and also min_k L_r(x, k) - size_t costBufSize = width1*D; - size_t CSBufSize = costBufSize*height; //4e: For HH mode it's better to keep whole array of costs. - size_t minLrSize = (width1 + LrBorder*2), LrSize = minLrSize*D2; //4e: TODO: Understand why NR2 per pass instead od NR2/2 (Probably, without any reason. That doesn't make code wrong) - int hsumBufNRows = SH2*2 + 2; - size_t totalBufSize = (LrSize + minLrSize)*NLR*sizeof(CostType) + // minLr[] and Lr[] - costBufSize*(hsumBufNRows + 1)*sizeof(CostType) + // hsumBuf, pixdiff //4e: TODO: Why we should increase sum window height one more time? - CSBufSize*2*sizeof(CostType) + // C, S //4e: C is Block sum of costs, S is multidirectional dynamic sum with same size - width*16*img1.channels()*sizeof(PixType) + // temp buffer for computing per-pixel cost //4e: It is needed for calcPixelCostBT function, as "buffer" value - width*(sizeof(CostType) + sizeof(DispType)) + 1024; // disp2cost + disp2 - - if( buffer.empty() || !buffer.isContinuous() || - buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize ) - buffer.create(1, (int)totalBufSize, CV_8U); - - // summary cost over different (nDirs) directions - CostType* Cbuf = (CostType*)alignPtr(buffer.ptr(), ALIGN); - CostType* Sbuf = Cbuf + CSBufSize; - CostType* hsumBuf = Sbuf + CSBufSize; - CostType* pixDiff = hsumBuf + costBufSize*hsumBufNRows; - - CostType* disp2cost = pixDiff + costBufSize + (LrSize + minLrSize)*NLR; //4e: It is containers for backwards disparity, made by S[d] too, but with other method - DispType* disp2ptr = (DispType*)(disp2cost + width); - PixType* tempBuf = (PixType*)(disp2ptr + width); - - // add P2 to every C(x,y). it saves a few operations in the inner loops - for(k = 0; k < (int)CSBufSize; k++ ) - Cbuf[k] = (CostType)P2; - - // Two vertical passes - up to down and down to up - for( int pass = 1; pass <= 2; pass++ ) //TODO: rename this magic 2. - { - int y1, y2, dy; - - if( pass == 1 ) - { - y1 = 0; y2 = height; dy = 1; - } - else - { - y1 = height-1; y2 = -1; dy = -1; - } - - CostType *Lr[NLR]={0}, *minLr[NLR]={0}; //4e: arrays for L(x,y,r,d) of previous and current rows and minimums of them - - for( k = 0; k < NLR; k++ ) //4e: One of them is needed, and one of them is stored. So, we need to swap pointer - { //4e: Yes, and this is done at the end of next cycle, not here. - // shift Lr[k] and minLr[k] pointers, because we allocated them with the borders, - // and will occasionally use negative indices with the arrays - // we need to shift Lr[k] pointers by 1, to give the space for d=-1. - // however, then the alignment will be imperfect, i.e. bad for SSE, - // thus we shift the pointers by 8 (8*sizeof(short) == 16 - ideal alignment) - Lr[k] = pixDiff + costBufSize + LrSize*k + D2*LrBorder + 8; - memset( Lr[k] - LrBorder*D2 - 8, 0, LrSize*sizeof(CostType) ); - minLr[k] = pixDiff + costBufSize + LrSize*NLR + minLrSize*k + LrBorder; - memset( minLr[k] - LrBorder, 0, minLrSize*sizeof(CostType) ); - } - - for( int y = y1; y != y2; y += dy ) - { - int x, d; - CostType* C = Cbuf + y*costBufSize; - CostType* S = Sbuf + y*costBufSize; - - if( pass == 1 ) // compute C on the first pass, and reuse it on the second pass, if any. - { - int dy1 = y == 0 ? 0 : y + SH2, dy2 = y == 0 ? SH2 : dy1; //4e: for first line's block sum we need calculate half-window of costs and only one for other - - for( k = dy1; k <= dy2; k++ ) - { - CostType* hsumAdd = hsumBuf + (std::min(k, height-1) % hsumBufNRows)*costBufSize; //4e: Ring buffer for horizontally summed lines - - if( k < height ) - { - calcPixelCostBT( img1, img2, k, minD, maxD, pixDiff, tempBuf, clipTab, TAB_OFS, ftzero ); - - memset(hsumAdd, 0, D*sizeof(CostType)); - for( x = 0; x <= SW2*D; x += D ) //4e: Calculation summed costs for all disparities in first pixel of line - { - int scale = x == 0 ? SW2 + 1 : 1; - for( d = 0; d < D; d++ ) - hsumAdd[d] = (CostType)(hsumAdd[d] + pixDiff[x + d]*scale); - } - - if( y > 0 ) //4e: We calculate horizontal sums and forming full block sums for y coord by adding this horsums to previous line's sums and subtracting stored lowest - { //4e: horsum in hsumBuf. Exception is case y=0, where we need many iterations per lines to create full blocking sum. - const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize; - const CostType* Cprev = C - costBufSize; //4e: Well, actually y>0, so we don't need this check: y==0 - - for( x = D; x < width1*D; x += D ) - { - const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D); - const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0); - -// #if CV_SIMD128 -// if( useSIMD ) -// { -// for( d = 0; d < D; d += 8 ) -// { -// v_int16x8 hv = v_load(hsumAdd + x - D + d); -// v_int16x8 Cx = v_load(Cprev + x + d); -// v_int16x8 psub = v_load(pixSub + d); -// v_int16x8 padd = v_load(pixAdd + d); -// hv = (hv - psub + padd); -// psub = v_load(hsumSub + x + d); -// Cx = Cx - psub + hv; -// v_store(hsumAdd + x + d, hv); -// v_store(C + x + d, Cx); -// } -// } -// else -// #endif - { - for( d = 0; d < D; d++ ) - { - int hv = hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]); - C[x + d] = (CostType)(Cprev[x + d] + hv - hsumSub[x + d]); - } - } - } - } - else - { - for( x = D; x < width1*D; x += D ) //4e: Calcluates horizontal sums if (y==0). This piece of code is calling SH2+1 times and then result is used in different way - { //4e: to create full blocks sum. That's why this code is isolated from upper case. - const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D); - const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0); - - for( d = 0; d < D; d++ ) - hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]); - } - } - } - - if( y == 0 ) //4e: Calculating first full block sum. - { - int scale = k == 0 ? SH2 + 1 : 1; - for( x = 0; x < width1*D; x++ ) - C[x] = (CostType)(C[x] + hsumAdd[x]*scale); - } - } - - // also, clear the S buffer - for( k = 0; k < width1*D; k++ ) //4e: only on first pass, so it keep old information, don't be confused - S[k] = 0; - } - - // clear the left and the right borders - memset( Lr[0] - D2*LrBorder - 8, 0, D2*LrBorder*sizeof(CostType) ); //4e: To understand this "8" shifts and how they could work it's simpler to imagine pixel dislocation in memory - memset( Lr[0] + width1*D2 - 8, 0, D2*LrBorder*sizeof(CostType) ); //4e: ...00000000|D2-16 of real costs value(and some of them are zeroes too)|00000000... - memset( minLr[0] - LrBorder, 0, LrBorder*sizeof(CostType) ); - memset( minLr[0] + width1, 0, LrBorder*sizeof(CostType) ); - - /* - [formula 13 in the paper] - compute L_r(p, d) = C(p, d) + - min(L_r(p-r, d), - L_r(p-r, d-1) + P1, - L_r(p-r, d+1) + P1, - min_k L_r(p-r, k) + P2) - min_k L_r(p-r, k) - where p = (x,y), r is one of the directions. - we process all the directions at once: - 0: r=(-dx, 0) - 1: r=(-1, -dy) - 2: r=(0, -dy) - 3: r=(1, -dy) - 4: r=(-2, -dy) - 5: r=(-1, -dy*2) - 6: r=(1, -dy*2) - 7: r=(2, -dy) - */ - for( x = 0; x != width1; x++ ) - { - int xd = x*D2; - - int delta = minLr[1][x] + P2; - - CostType* Lr_ppr = Lr[1] + xd; - - Lr_ppr[-1] = Lr_ppr[D] = MAX_COST; - - CostType* Lr_p = Lr[0] + xd; - const CostType* Cp = C + x*D; - CostType* Sp = S + x*D; - -// #if CV_SIMD128 -// if( useSIMD ) -// { -// v_int16x8 _P1 = v_setall_s16((short)P1); -// -// v_int16x8 _delta0 = v_setall_s16((short)delta0); -// v_int16x8 _delta1 = v_setall_s16((short)delta1); -// v_int16x8 _delta2 = v_setall_s16((short)delta2); -// v_int16x8 _delta3 = v_setall_s16((short)delta3); -// v_int16x8 _minL0 = v_setall_s16((short)MAX_COST); -// -// for( d = 0; d < D; d += 8 ) -// { -// v_int16x8 Cpd = v_load(Cp + d); -// v_int16x8 L0, L1, L2, L3; -// -// L0 = v_load(Lr_p0 + d); -// L1 = v_load(Lr_p1 + d); -// L2 = v_load(Lr_ppr + d); -// L3 = v_load(Lr_p3 + d); -// -// L0 = v_min(L0, (v_load(Lr_p0 + d - 1) + _P1)); -// L0 = v_min(L0, (v_load(Lr_p0 + d + 1) + _P1)); -// -// L1 = v_min(L1, (v_load(Lr_p1 + d - 1) + _P1)); -// L1 = v_min(L1, (v_load(Lr_p1 + d + 1) + _P1)); -// -// L2 = v_min(L2, (v_load(Lr_ppr + d - 1) + _P1)); -// L2 = v_min(L2, (v_load(Lr_ppr + d + 1) + _P1)); -// -// L3 = v_min(L3, (v_load(Lr_p3 + d - 1) + _P1)); -// L3 = v_min(L3, (v_load(Lr_p3 + d + 1) + _P1)); -// -// L0 = v_min(L0, _delta0); -// L0 = ((L0 - _delta0) + Cpd); -// -// L1 = v_min(L1, _delta1); -// L1 = ((L1 - _delta1) + Cpd); -// -// L2 = v_min(L2, _delta2); -// L2 = ((L2 - _delta2) + Cpd); -// -// L3 = v_min(L3, _delta3); -// L3 = ((L3 - _delta3) + Cpd); -// -// v_store(Lr_p + d, L0); -// v_store(Lr_p + d + D2, L1); -// v_store(Lr_p + d + D2*2, L2); -// v_store(Lr_p + d + D2*3, L3); -// -// // Get minimum from in L0-L3 -// v_int16x8 t02L, t02H, t13L, t13H, t0123L, t0123H; -// v_zip(L0, L2, t02L, t02H); // L0[0] L2[0] L0[1] L2[1]... -// v_zip(L1, L3, t13L, t13H); // L1[0] L3[0] L1[1] L3[1]... -// v_int16x8 t02 = v_min(t02L, t02H); // L0[i] L2[i] L0[i] L2[i]... -// v_int16x8 t13 = v_min(t13L, t13H); // L1[i] L3[i] L1[i] L3[i]... -// v_zip(t02, t13, t0123L, t0123H); // L0[i] L1[i] L2[i] L3[i]... -// v_int16x8 t0 = v_min(t0123L, t0123H); -// _minL0 = v_min(_minL0, t0); -// -// v_int16x8 Sval = v_load(Sp + d); -// -// L0 = L0 + L1; -// L2 = L2 + L3; -// Sval = Sval + L0; -// Sval = Sval + L2; -// -// v_store(Sp + d, Sval); -// } -// -// v_int32x4 minL, minH; -// v_expand(_minL0, minL, minH); -// v_pack_store(&minLr[0][x], v_min(minL, minH)); -// } -// else -// #endif - { - int minL = MAX_COST; - - for( d = 0; d < D; d++ ) - { - int Cpd = Cp[d], L; //4e: Remember, that every Cp is increased on P2 in line number 369. That's why next 4 lines are paper-like actually - - L = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta; - - Lr_p[d] = (CostType)L; - minL = std::min(minL, L); - - Sp[d] = saturate_cast(Sp[d] + L); - } - minLr[0][x] = (CostType)minL; - } - } - - // now shift the cyclic buffers - std::swap( Lr[0], Lr[1] ); - std::swap( minLr[0], minLr[1] ); - } - } -// for(int y = 0; y(y); - CostType* C = Cbuf + y*costBufSize; - CostType* S = Sbuf + y*costBufSize; - - for( x = 0; x < width; x++ ) - { - disp1ptr[x] = disp2ptr[x] = (DispType)INVALID_DISP_SCALED; - disp2cost[x] = MAX_COST; - } - - // clear the left and the right borders - memset( Lr - D2*LrBorder - 8, 0, D2*LrBorder*sizeof(CostType) ); //4e: To understand this "8" shifts and how they could work it's simpler to imagine pixel dislocation in memory - memset( Lr + width1*D2 - 8, 0, D2*LrBorder*sizeof(CostType) ); //4e: ...00000000|D2-16 of real costs value(and some of them are zeroes too)|00000000... - memset( minLr - LrBorder, 0, LrBorder*sizeof(CostType) ); - memset( minLr + width1, 0, LrBorder*sizeof(CostType) ); - - /* - [formula 13 in the paper] - compute L_r(p, d) = C(p, d) + - min(L_r(p-r, d), - L_r(p-r, d-1) + P1, - L_r(p-r, d+1) + P1, - min_k L_r(p-r, k) + P2) - min_k L_r(p-r, k) - where p = (x,y), r is one of the directions. - we process all the directions at once: - 0: r=(-dx, 0) - 1: r=(-1, -dy) - 2: r=(0, -dy) - 3: r=(1, -dy) - 4: r=(-2, -dy) - 5: r=(-1, -dy*2) - 6: r=(1, -dy*2) - 7: r=(2, -dy) - */ - for( x = 0; x != width1; x++) - { - int xd = x*D2; - - int delta = minLr[x - 1] + P2; - - CostType* Lr_ppr = Lr + xd - D2; - - Lr_ppr[-1] = Lr_ppr[D] = MAX_COST; - - CostType* Lr_p = Lr + xd; - const CostType* Cp = C + x*D; - CostType* Sp = S + x*D; - -// #if CV_SIMD128 -// if( useSIMD ) -// { -// v_int16x8 _P1 = v_setall_s16((short)P1); -// -// v_int16x8 _delta0 = v_setall_s16((short)delta0); -// v_int16x8 _delta1 = v_setall_s16((short)delta1); -// v_int16x8 _delta2 = v_setall_s16((short)delta2); -// v_int16x8 _delta3 = v_setall_s16((short)delta3); -// v_int16x8 _minL0 = v_setall_s16((short)MAX_COST); -// -// for( d = 0; d < D; d += 8 ) -// { -// v_int16x8 Cpd = v_load(Cp + d); -// v_int16x8 L0, L1, L2, L3; -// -// L0 = v_load(Lr_ppr + d); -// L1 = v_load(Lr_p1 + d); -// L2 = v_load(Lr_p2 + d); -// L3 = v_load(Lr_p3 + d); -// -// L0 = v_min(L0, (v_load(Lr_ppr + d - 1) + _P1)); -// L0 = v_min(L0, (v_load(Lr_ppr + d + 1) + _P1)); -// -// L1 = v_min(L1, (v_load(Lr_p1 + d - 1) + _P1)); -// L1 = v_min(L1, (v_load(Lr_p1 + d + 1) + _P1)); -// -// L2 = v_min(L2, (v_load(Lr_p2 + d - 1) + _P1)); -// L2 = v_min(L2, (v_load(Lr_p2 + d + 1) + _P1)); -// -// L3 = v_min(L3, (v_load(Lr_p3 + d - 1) + _P1)); -// L3 = v_min(L3, (v_load(Lr_p3 + d + 1) + _P1)); -// -// L0 = v_min(L0, _delta0); -// L0 = ((L0 - _delta0) + Cpd); -// -// L1 = v_min(L1, _delta1); -// L1 = ((L1 - _delta1) + Cpd); -// -// L2 = v_min(L2, _delta2); -// L2 = ((L2 - _delta2) + Cpd); -// -// L3 = v_min(L3, _delta3); -// L3 = ((L3 - _delta3) + Cpd); -// -// v_store(Lr_p + d, L0); -// v_store(Lr_p + d + D2, L1); -// v_store(Lr_p + d + D2*2, L2); -// v_store(Lr_p + d + D2*3, L3); -// -// // Get minimum from in L0-L3 -// v_int16x8 t02L, t02H, t13L, t13H, t0123L, t0123H; -// v_zip(L0, L2, t02L, t02H); // L0[0] L2[0] L0[1] L2[1]... -// v_zip(L1, L3, t13L, t13H); // L1[0] L3[0] L1[1] L3[1]... -// v_int16x8 t02 = v_min(t02L, t02H); // L0[i] L2[i] L0[i] L2[i]... -// v_int16x8 t13 = v_min(t13L, t13H); // L1[i] L3[i] L1[i] L3[i]... -// v_zip(t02, t13, t0123L, t0123H); // L0[i] L1[i] L2[i] L3[i]... -// v_int16x8 t0 = v_min(t0123L, t0123H); -// _minL0 = v_min(_minL0, t0); -// -// v_int16x8 Sval = v_load(Sp + d); -// -// L0 = L0 + L1; -// L2 = L2 + L3; -// Sval = Sval + L0; -// Sval = Sval + L2; -// -// v_store(Sp + d, Sval); -// } -// -// v_int32x4 minL, minH; -// v_expand(_minL0, minL, minH); -// v_pack_store(&minLr[x], v_min(minL, minH)); -// } -// else -// #endif - { - int minL = MAX_COST; - - for( d = 0; d < D; d++ ) - { - int Cpd = Cp[d], L; //4e: Remember, that every Cp is increased on P2 in line number 369. That's why next 4 lines are paper-like actually - - L = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta; - - Lr_p[d] = (CostType)L; - minL = std::min(minL, L); - - Sp[d] = saturate_cast(Sp[d] + L); - } - minLr[x] = (CostType)minL; - } - } - - for( x = width1-1; x != -1; x--) - { - int xd = x*D2; - - int delta = minLr[x + 1] + P2; - - CostType* Lr_ppr = Lr + xd + D2; - - Lr_ppr[-1] = Lr_ppr[D] = MAX_COST; - - CostType* Lr_p = Lr + xd; - const CostType* Cp = C + x*D; - CostType* Sp = S + x*D; - int minS = MAX_COST, bestDisp = -1; - -// #if CV_SIMD128 -// if( useSIMD ) -// { -// v_int16x8 _P1 = v_setall_s16((short)P1); -// -// v_int16x8 _delta0 = v_setall_s16((short)delta0); -// v_int16x8 _delta1 = v_setall_s16((short)delta1); -// v_int16x8 _delta2 = v_setall_s16((short)delta2); -// v_int16x8 _delta3 = v_setall_s16((short)delta3); -// v_int16x8 _minL0 = v_setall_s16((short)MAX_COST); -// -// for( d = 0; d < D; d += 8 ) -// { -// v_int16x8 Cpd = v_load(Cp + d); -// v_int16x8 L0, L1, L2, L3; -// -// L0 = v_load(Lr_ppr + d); -// L1 = v_load(Lr_p1 + d); -// L2 = v_load(Lr_p2 + d); -// L3 = v_load(Lr_p3 + d); -// -// L0 = v_min(L0, (v_load(Lr_ppr + d - 1) + _P1)); -// L0 = v_min(L0, (v_load(Lr_ppr + d + 1) + _P1)); -// -// L1 = v_min(L1, (v_load(Lr_p1 + d - 1) + _P1)); -// L1 = v_min(L1, (v_load(Lr_p1 + d + 1) + _P1)); -// -// L2 = v_min(L2, (v_load(Lr_p2 + d - 1) + _P1)); -// L2 = v_min(L2, (v_load(Lr_p2 + d + 1) + _P1)); -// -// L3 = v_min(L3, (v_load(Lr_p3 + d - 1) + _P1)); -// L3 = v_min(L3, (v_load(Lr_p3 + d + 1) + _P1)); -// -// L0 = v_min(L0, _delta0); -// L0 = ((L0 - _delta0) + Cpd); -// -// L1 = v_min(L1, _delta1); -// L1 = ((L1 - _delta1) + Cpd); -// -// L2 = v_min(L2, _delta2); -// L2 = ((L2 - _delta2) + Cpd); -// -// L3 = v_min(L3, _delta3); -// L3 = ((L3 - _delta3) + Cpd); -// -// v_store(Lr_p + d, L0); -// v_store(Lr_p + d + D2, L1); -// v_store(Lr_p + d + D2*2, L2); -// v_store(Lr_p + d + D2*3, L3); -// -// // Get minimum from in L0-L3 -// v_int16x8 t02L, t02H, t13L, t13H, t0123L, t0123H; -// v_zip(L0, L2, t02L, t02H); // L0[0] L2[0] L0[1] L2[1]... -// v_zip(L1, L3, t13L, t13H); // L1[0] L3[0] L1[1] L3[1]... -// v_int16x8 t02 = v_min(t02L, t02H); // L0[i] L2[i] L0[i] L2[i]... -// v_int16x8 t13 = v_min(t13L, t13H); // L1[i] L3[i] L1[i] L3[i]... -// v_zip(t02, t13, t0123L, t0123H); // L0[i] L1[i] L2[i] L3[i]... -// v_int16x8 t0 = v_min(t0123L, t0123H); -// _minL0 = v_min(_minL0, t0); -// -// v_int16x8 Sval = v_load(Sp + d); -// -// L0 = L0 + L1; -// L2 = L2 + L3; -// Sval = Sval + L0; -// Sval = Sval + L2; -// -// v_store(Sp + d, Sval); -// } -// -// v_int32x4 minL, minH; -// v_expand(_minL0, minL, minH); -// v_pack_store(&minLr[x], v_min(minL, minH)); -// } -// else -// #endif -//TODO:Next piece of code is came from postprocessing. Be very careful with joining them!!! -// #if CV_SIMD128 -// if( useSIMD ) -// { -// v_int16x8 _minS = v_setall_s16(MAX_COST), _bestDisp = v_setall_s16(-1); -// v_int16x8 _d8 = v_int16x8(0, 1, 2, 3, 4, 5, 6, 7), _8 = v_setall_s16(8); -// -// for( d = 0; d < D; d+= 8 ) -// { -// v_int16x8 L0 = v_load(Sp + d); -// v_int16x8 mask = L0 < _minS; -// _minS = v_min( L0, _minS ); -// _bestDisp = _bestDisp ^ ((_bestDisp ^ _d8) & mask); -// _d8 = _d8 + _8; -// } -// v_int32x4 _d0, _d1; -// v_expand(_minS, _d0, _d1); -// minS = (int)std::min(v_reduce_min(_d0), v_reduce_min(_d1)); -// v_int16x8 v_mask = v_setall_s16((short)minS) == _minS; -// -// _bestDisp = (_bestDisp & v_mask) | (v_setall_s16(SHRT_MAX) & ~v_mask); -// v_expand(_bestDisp, _d0, _d1); -// bestDisp = (int)std::min(v_reduce_min(_d0), v_reduce_min(_d1)); -// } -// else -// #endif - { - int minL = MAX_COST; - - for( d = 0; d < D; d++ ) - { - int Cpd = Cp[d], L; //4e: Remember, that every Cp is increased on P2 in line number 369. That's why next 4 lines are paper-like actually - - L = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta; - - Lr_p[d] = (CostType)L; - minL = std::min(minL, L); - - Sp[d] = saturate_cast(Sp[d] + L); - if( Sp[d] < minS ) - { - minS = Sp[d]; - bestDisp = d; - } - } - minLr[x] = (CostType)minL; - } - //Some postprocessing procedures and saving - for( d = 0; d < D; d++ ) - { - if( Sp[d]*(100 - uniquenessRatio) < minS*100 && std::abs(bestDisp - d) > 1 ) - break; - } - if( d < D ) - continue; - d = bestDisp; - int _x2 = x + minX1 - d - minD; - if( disp2cost[_x2] > minS ) - { - disp2cost[_x2] = (CostType)minS; - disp2ptr[_x2] = (DispType)(d + minD); - } - - if( 0 < d && d < D-1 ) - { - // do subpixel quadratic interpolation: - // fit parabola into (x1=d-1, y1=Sp[d-1]), (x2=d, y2=Sp[d]), (x3=d+1, y3=Sp[d+1]) - // then find minimum of the parabola. - int denom2 = std::max(Sp[d-1] + Sp[d+1] - 2*Sp[d], 1); - d = d*DISP_SCALE + ((Sp[d-1] - Sp[d+1])*DISP_SCALE + denom2)/(denom2*2); - } - else - d *= DISP_SCALE; - disp1ptr[x + minX1] = (DispType)(d + minD*DISP_SCALE); - } - //Left-right check sanity procedure - for( x = minX1; x < maxX1; x++ ) - { - // we round the computed disparity both towards -inf and +inf and check - // if either of the corresponding disparities in disp2 is consistent. - // This is to give the computed disparity a chance to look valid if it is. - int d1 = disp1ptr[x]; - if( d1 == INVALID_DISP_SCALED ) - continue; - int _d = d1 >> DISP_SHIFT; - int d_ = (d1 + DISP_SCALE-1) >> DISP_SHIFT; - int _x = x - _d, x_ = x - d_; - if( 0 <= _x && _x < width && disp2ptr[_x] >= minD && std::abs(disp2ptr[_x] - _d) > disp12MaxDiff && //4e: To dismiss disparity, we should assure, that there is no any - 0 <= x_ && x_ < width && disp2ptr[x_] >= minD && std::abs(disp2ptr[x_] - d_) > disp12MaxDiff ) //4e: chance to understand this as correct. - disp1ptr[x] = (DispType)INVALID_DISP_SCALED; - } - } - } -} -////////////////////////////////////////////////////////////////////////////////////////////////////// - void getBufferPointers(Mat& buffer, int width, int width1, int D, int num_ch, int SH2, int P2, CostType*& curCostVolumeLine, CostType*& hsumBuf, CostType*& pixDiff, PixType*& tmpBuf, CostType*& horPassCostVolume, @@ -3018,8 +2330,6 @@ public: if(params.mode==MODE_SGBM_3WAY) computeDisparity3WaySGBM( left, right, disp, params, buffers, num_stripes ); - else if(params.mode==MODE_HH4_OLD) - computeDisparitySGBMParallelOld( left, right, disp, params, buffer ); else if(params.mode==MODE_HH4) computeDisparitySGBMParallel( left, right, disp, params, buffer ); else diff --git a/modules/calib3d/test/test_stereomatching.cpp b/modules/calib3d/test/test_stereomatching.cpp index e1b5c5200e..9a338bb11f 100644 --- a/modules/calib3d/test/test_stereomatching.cpp +++ b/modules/calib3d/test/test_stereomatching.cpp @@ -797,23 +797,25 @@ TEST(Calib3d_StereoSGBMPar, idontknowhowtotesthere) // int mode) Mat leftImg = imread("/home/q/Work/GitVault/opencv_extra/testdata/cv/stereomatching/datasets/teddy/im2.png"); Mat rightImg = imread("/home/q/Work/GitVault/opencv_extra/testdata/cv/stereomatching/datasets/teddy/im6.png"); - Mat leftDisp_old, leftDisp_new; +// Mat leftDisp_old, leftDisp_new; + { + Mat leftDisp; + Ptr sgbm = StereoSGBM::create( 0, 48, 3, 90, 360, 1, 63, 10, 100, 32, StereoSGBM::MODE_HH); + sgbm->compute( leftImg, rightImg, leftDisp); + CV_Assert( leftDisp.type() == CV_16SC1 ); + leftDisp/=8; + imwrite( "/home/q/Work/GitVault/modehh4_new.jpg", leftDisp); + } { Mat leftDisp; Ptr sgbm = StereoSGBM::create( 0, 48, 3, 90, 360, 1, 63, 10, 100, 32, StereoSGBM::MODE_HH4); - sgbm->compute( leftImg, rightImg, leftDisp_new ); - CV_Assert( leftDisp_new.type() == CV_16SC1 ); -// leftDisp/=8; -// imwrite( "/home/q/Work/GitVault/modehh4_new.jpg", leftDisp); + sgbm->compute( leftImg, rightImg, leftDisp); + CV_Assert( leftDisp.type() == CV_16SC1 ); + leftDisp/=8; + imwrite( "/home/q/Work/GitVault/modehh4_old.jpg", leftDisp); } - { - Ptr sgbm = StereoSGBM::create( 0, 48, 3, 90, 360, 1, 63, 10, 100, 32, StereoSGBM::MODE_HH4_OLD); - sgbm->compute( leftImg, rightImg, leftDisp_old ); - CV_Assert( leftDisp_old.type() == CV_16SC1 ); -// leftDisp/=8; -// imwrite( "/home/q/Work/GitVault/modehh4_old.jpg", leftDisp); - } - Mat diff; - absdiff(leftDisp_old,leftDisp_new,diff); - CV_Assert( countNonZero(diff)==0); +// Mat diff; +// absdiff(leftDisp_old,leftDisp_new,diff); +// CV_Assert( countNonZero(diff)==0); +// }