Memory allocation fixes + cleanup.

Pyotr Chekmaryov 2017-04-25 21:00:31 +00:00
parent 7ffa49e02c
commit 21be2aa677
2 changed files with 144 additions and 491 deletions


@ -125,7 +125,6 @@ static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
int minD, int maxD, CostType* cost,
PixType* buffer, const PixType* tab,
int tabOfs, int , int xrange_min = 0, int xrange_max = DEFAULT_RIGHT_BORDER )
//TODO: This function was changed and its behavior differs from the old function's. Check this in the tests
{
int x, c, width = img1.cols, cn = img1.channels();
int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
@ -311,9 +310,9 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
const CostType MAX_COST = SHRT_MAX;
int minD = params.minDisparity, maxD = minD + params.numDisparities;
Size SADWindowSize; //4e: SAD means Sum of Absolute Differences
SADWindowSize.width = SADWindowSize.height = params.SADWindowSize > 0 ? params.SADWindowSize : 5; //4e: and this is always square
int ftzero = std::max(params.preFilterCap, 15) | 1; //4e: ftzero clips the x-derivatives. I think this array business relates to an unimplemented SIMD path
Size SADWindowSize;
SADWindowSize.width = SADWindowSize.height = params.SADWindowSize > 0 ? params.SADWindowSize : 5;
int ftzero = std::max(params.preFilterCap, 15) | 1;
int uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10;
int disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1;
int P1 = params.P1 > 0 ? params.P1 : 2, P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);
@ -323,11 +322,11 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
int SW2 = SADWindowSize.width/2, SH2 = SADWindowSize.height/2;
bool fullDP = params.mode == StereoSGBM::MODE_HH;
int npasses = fullDP ? 2 : 1; //4e: 2 passes with different behaviour for the Hirschmueller (HH) method.
const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2; //4e: the array is this big because the derivative can reach +-8*256 in the worst case
int npasses = fullDP ? 2 : 1;
const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2;
PixType clipTab[TAB_SIZE];
for( k = 0; k < TAB_SIZE; k++ ) //4e: If ftzero were 4, the clipped values would run -4 -4 ... -4 -3 -2 -1 0 1 2 3 4 ... 4 4, shifted by +ftzero into 0..8 when stored
for( k = 0; k < TAB_SIZE; k++ )
clipTab[k] = (PixType)(std::min(std::max(k - TAB_OFS, -ftzero), ftzero) + ftzero);
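// A worked example of the table above (hypothetical ftzero = 4, not a value taken
// from the code): clipTab[k] = clip(k - TAB_OFS, -ftzero, ftzero) + ftzero, so
//   k - TAB_OFS : ... -6 -5 -4 -3 -2 -1  0  1  2  3  4  5  6 ...
//   clipTab[k]  : ...  0  0  0  1  2  3  4  5  6  7  8  8  8 ...
// i.e. the stored values always lie in [0, 2*ftzero].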
if( minX1 >= maxX1 )
@ -340,25 +339,25 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
// NR - the number of directions. the loop on x below that computes Lr assumes that NR == 8.
// if you change NR, please, modify the loop as well.
int D2 = D+16, NRD2 = NR2*D2; //4e: Somewhere in the code we need d+1, hence D+1. The simplest solution is to grow the D dimension by 1, which becomes 16 once the storage has to stay aligned.
int D2 = D+16, NRD2 = NR2*D2;
// the number of L_r(.,.) and min_k L_r(.,.) lines in the buffer:
// for 8-way dynamic programming we need the current row and
// the previous row, i.e. 2 rows in total
const int NLR = 2; //4e: We assume the linear DP needs one or more previous rows (exactly one here).
const int LrBorder = NLR - 1; //4e: to simplify indexing we keep a border for reading the previous DP row.
const int NLR = 2;
const int LrBorder = NLR - 1;
// for each possible stereo match (img1(x,y) <=> img2(x-d,y))
// we keep pixel difference cost (C) and the summary cost over NR directions (S).
// we also keep all the partial costs for the previous line L_r(x,d) and also min_k L_r(x, k)
size_t costBufSize = width1*D;
size_t CSBufSize = costBufSize*(fullDP ? height : 1); //4e: For HH mode it is better to keep the whole cost array.
size_t minLrSize = (width1 + LrBorder*2)*NR2, LrSize = minLrSize*D2; //4e: TODO: Understand why NR2 per pass instead of NR2/2 (probably no particular reason; it doesn't make the code wrong)
size_t CSBufSize = costBufSize*(fullDP ? height : 1);
size_t minLrSize = (width1 + LrBorder*2)*NR2, LrSize = minLrSize*D2;
int hsumBufNRows = SH2*2 + 2;
size_t totalBufSize = (LrSize + minLrSize)*NLR*sizeof(CostType) + // minLr[] and Lr[]
costBufSize*(hsumBufNRows + 1)*sizeof(CostType) + // hsumBuf, pixdiff //4e: TODO: Why should the sum window height be increased one more time?
CSBufSize*2*sizeof(CostType) + // C, S //4e: C is the block sum of costs, S is the multi-directional DP sum of the same size
width*16*img1.channels()*sizeof(PixType) + // temp buffer for computing per-pixel cost //4e: needed by calcPixelCostBT as its "buffer" argument
costBufSize*(hsumBufNRows + 1)*sizeof(CostType) + // hsumBuf, pixdiff
CSBufSize*2*sizeof(CostType) + // C, S
width*16*img1.channels()*sizeof(PixType) + // temp buffer for computing per-pixel cost
width*(sizeof(CostType) + sizeof(DispType)) + 1024; // disp2cost + disp2
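// Rough sketch of how the single aligned buffer is partitioned, as implied by the
// pointer arithmetic below (sizes are the ones computed above):
//   [ C : CSBufSize ][ S : CSBufSize ][ hsumBuf : costBufSize*hsumBufNRows ]
//   [ pixDiff : costBufSize ][ Lr + minLr : (LrSize + minLrSize)*NLR ]
//   [ disp2cost : width ][ disp2 : width ][ tempBuf : width*16*cn ]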
if( buffer.empty() || !buffer.isContinuous() ||
@ -371,7 +370,7 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
CostType* hsumBuf = Sbuf + CSBufSize;
CostType* pixDiff = hsumBuf + costBufSize*hsumBufNRows;
CostType* disp2cost = pixDiff + costBufSize + (LrSize + minLrSize)*NLR; //4e: These are containers for the backward disparity, also built from S[d] but by a different method
CostType* disp2cost = pixDiff + costBufSize + (LrSize + minLrSize)*NLR;
DispType* disp2ptr = (DispType*)(disp2cost + width);
PixType* tempBuf = (PixType*)(disp2ptr + width);
@ -383,21 +382,21 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
{
int x1, y1, x2, y2, dx, dy;
if( pass == 1 ) //4e: on the first pass, we work down, so we calculate the directions down, diag-down and right
if( pass == 1 )
{
y1 = 0; y2 = height; dy = 1;
x1 = 0; x2 = width1; dx = 1;
}
else //4e: on the second pass, we work up, so we calculate the directions up, diag-up and left
else
{
y1 = height-1; y2 = -1; dy = -1;
x1 = width1-1; x2 = -1; dx = -1;
}
CostType *Lr[NLR]={0}, *minLr[NLR]={0}; //4e: arrays for L(x,y,r,d) of previous and current rows and minimums of them
CostType *Lr[NLR]={0}, *minLr[NLR]={0};
for( k = 0; k < NLR; k++ ) //4e: One of them is read and one is written, so the pointers need to be swapped.
{ //4e: That swap happens at the end of the next loop, not here.
for( k = 0; k < NLR; k++ )
{
// shift Lr[k] and minLr[k] pointers, because we allocated them with the borders,
// and will occasionally use negative indices with the arrays
// we need to shift Lr[k] pointers by 1, to give the space for d=-1.
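// A picture of one pixel's Lr slot as I read the removed comments (illustrative,
// not from the patch): 8 zero shorts of padding, then the D real costs, then the
// rest of the 16-short over-allocation, so Lr_p[-1] and Lr_p[D] are safe to read
// while Lr_p + d stays 16-byte aligned:
//   ... 0 0 0 0 0 0 0 0 | L(d=0) ... L(d=D-1) | 0 ... | next direction/pixel ...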
@ -418,28 +417,28 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
if( pass == 1 ) // compute C on the first pass, and reuse it on the second pass, if any.
{
int dy1 = y == 0 ? 0 : y + SH2, dy2 = y == 0 ? SH2 : dy1; //4e: for the first line's block sum we need to compute half a window of cost rows, and only one row for the others
int dy1 = y == 0 ? 0 : y + SH2, dy2 = y == 0 ? SH2 : dy1;
for( k = dy1; k <= dy2; k++ )
{
CostType* hsumAdd = hsumBuf + (std::min(k, height-1) % hsumBufNRows)*costBufSize; //4e: Ring buffer for horizontally summed lines
CostType* hsumAdd = hsumBuf + (std::min(k, height-1) % hsumBufNRows)*costBufSize;
if( k < height )
{
calcPixelCostBT( img1, img2, k, minD, maxD, pixDiff, tempBuf, clipTab, TAB_OFS, ftzero );
memset(hsumAdd, 0, D*sizeof(CostType));
for( x = 0; x <= SW2*D; x += D ) //4e: Compute the summed costs over all disparities for the first pixel of the line
for( x = 0; x <= SW2*D; x += D )
{
int scale = x == 0 ? SW2 + 1 : 1;
for( d = 0; d < D; d++ )
hsumAdd[d] = (CostType)(hsumAdd[d] + pixDiff[x + d]*scale);
}
if( y > 0 ) //4e: We compute horizontal sums and form the full block sum for row y by adding them to the previous row's sums and subtracting the oldest stored
{ //4e: horizontal sum in hsumBuf. The exception is y=0, which needs several iterations per line to build the full block sum.
if( y > 0 )
{
const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize;
const CostType* Cprev = !fullDP || y == 0 ? C : C - costBufSize; //4e: Well, actually y>0, so we don't need this check: y==0
const CostType* Cprev = !fullDP || y == 0 ? C : C - costBufSize;
for( x = D; x < width1*D; x += D )
{
@ -475,8 +474,8 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
}
else
{
for( x = D; x < width1*D; x += D ) //4e: Calculates horizontal sums when y==0. This code runs SH2+1 times and its result is then combined differently
{ //4e: to build the full block sum, which is why it is kept separate from the case above.
for( x = D; x < width1*D; x += D )
{
const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);
const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);
@ -486,7 +485,7 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
}
}
if( y == 0 ) //4e: Calculating first full block sum.
if( y == 0 )
{
int scale = k == 0 ? SH2 + 1 : 1;
for( x = 0; x < width1*D; x++ )
@ -495,13 +494,13 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
}
// also, clear the S buffer
for( k = 0; k < width1*D; k++ ) //4e: only on the first pass, so on later passes S keeps the accumulated values; don't be confused
for( k = 0; k < width1*D; k++ )
S[k] = 0;
}
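// A sketch of what the pass-1 code above maintains (my notation; the arrays are
// flat in the real code): the block cost C is a 2-D box filter over the per-pixel
// BT costs, updated incrementally,
//   hsum(x, d) = hsum(x-1, d) + pixDiff(x+SW2, d) - pixDiff(x-SW2-1, d)   // slide right
//   C(y, x, d) = C(y-1, x, d) + hsum(x, d) - hsumSub(x, d)                // slide down
// where hsumSub holds the horizontal sums of the row leaving the window (y-SH2-1).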
// clear the left and the right borders
memset( Lr[0] - NRD2*LrBorder - 8, 0, NRD2*LrBorder*sizeof(CostType) ); //4e: To understand these "8" shifts it helps to picture the per-pixel layout in memory:
memset( Lr[0] + width1*NRD2 - 8, 0, NRD2*LrBorder*sizeof(CostType) ); //4e: ...00000000|NRD2-16 real cost values (some of them zeroes too)|00000000...
memset( Lr[0] - NRD2*LrBorder - 8, 0, NRD2*LrBorder*sizeof(CostType) );
memset( Lr[0] + width1*NRD2 - 8, 0, NRD2*LrBorder*sizeof(CostType) );
memset( minLr[0] - NR2*LrBorder, 0, NR2*LrBorder*sizeof(CostType) );
memset( minLr[0] + width1*NR2, 0, NR2*LrBorder*sizeof(CostType) );
@ -624,7 +623,7 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
for( d = 0; d < D; d++ )
{
int Cpd = Cp[d], L0, L1, L2, L3; //4e: Remember that every Cp was increased by P2 when C was initialized; that's why the next 4 lines match the paper
int Cpd = Cp[d], L0, L1, L2, L3;
L0 = Cpd + std::min((int)Lr_p0[d], std::min(Lr_p0[d-1] + P1, std::min(Lr_p0[d+1] + P1, delta0))) - delta0;
L1 = Cpd + std::min((int)Lr_p1[d], std::min(Lr_p1[d-1] + P1, std::min(Lr_p1[d+1] + P1, delta1))) - delta1;
@ -665,7 +664,7 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
CostType* Sp = S + x*D;
int minS = MAX_COST, bestDisp = -1;
if( npasses == 1 ) //4e: in this case we can take a fifth direction almost for free (direction "left")
if( npasses == 1 )
{
int xm = x*NR2, xd = xm*D2;
@ -815,7 +814,7 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
disp1ptr[x + minX1] = (DispType)(d + minD*DISP_SCALE);
}
for( x = minX1; x < maxX1; x++ ) //4e: Left-right check itself
for( x = minX1; x < maxX1; x++ )
{
// we round the computed disparity both towards -inf and +inf and check
// if either of the corresponding disparities in disp2 is consistent.
@ -826,8 +825,8 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
int _d = d1 >> DISP_SHIFT;
int d_ = (d1 + DISP_SCALE-1) >> DISP_SHIFT;
int _x = x - _d, x_ = x - d_;
if( 0 <= _x && _x < width && disp2ptr[_x] >= minD && std::abs(disp2ptr[_x] - _d) > disp12MaxDiff && //4e: To reject a disparity we must be sure there is no
0 <= x_ && x_ < width && disp2ptr[x_] >= minD && std::abs(disp2ptr[x_] - d_) > disp12MaxDiff ) //4e: way left to interpret it as correct.
if( 0 <= _x && _x < width && disp2ptr[_x] >= minD && std::abs(disp2ptr[_x] - _d) > disp12MaxDiff &&
0 <= x_ && x_ < width && disp2ptr[x_] >= minD && std::abs(disp2ptr[x_] - d_) > disp12MaxDiff )
disp1ptr[x] = (DispType)INVALID_DISP_SCALED;
}
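// Worked example of the consistency check above (illustrative numbers): with
// DISP_SCALE = 16, a sub-pixel disparity d1 = 37 rounds down to _d = 2 and up to
// d_ = 3; the pixel is invalidated only if disp2ptr[x-2] and disp2ptr[x-3] are
// both valid and both differ from their rounded value by more than disp12MaxDiff.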
}
@ -840,8 +839,6 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
}
////////////////////////////////////////////////////////////////////////////////////////////
//TODO: Assumption: pretend that pixDiff and tempBuf are allocated independently in each thread, at the full size needed by the original calcPixelCostBT
//TODO: Revisit the size of these arrays even if the independent-allocation situation persists.
struct CalcVerticalSums: public ParallelLoopBody
{
CalcVerticalSums(const Mat& _img1, const Mat& _img2, const StereoSGBMParams& params,
@ -852,7 +849,7 @@ struct CalcVerticalSums: public ParallelLoopBody
SW2 = SH2 = (params.SADWindowSize > 0 ? params.SADWindowSize : 5)/2;
ftzero = std::max(params.preFilterCap, 15) | 1;
P1 = params.P1 > 0 ? params.P1 : 2;
P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1); //TODO: think about P1/S(x,y) Proportion
P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);
height = img1.rows;
width = img1.cols;
int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
@ -861,7 +858,7 @@ struct CalcVerticalSums: public ParallelLoopBody
D2 = D + 16;
costBufSize = width1*D;
CSBufSize = costBufSize*height;
minLrSize = (width1 + LrBorder*2);
minLrSize = width1;
LrSize = minLrSize*D2;
hsumBufNRows = SH2*2 + 2;
Cbuf = alignedBuf;
@ -874,10 +871,11 @@ struct CalcVerticalSums: public ParallelLoopBody
static const CostType MAX_COST = SHRT_MAX;
static const int ALIGN = 16;
static const int TAB_OFS = 256*4;
static const int npasses = 2;
int x1 = range.start, x2 = range.end, k;
size_t pixDiffSize = ((x2 - x1) + 2*SW2)*D;
size_t auxBufsSize = pixDiffSize*sizeof(CostType) + //pixdiff size
width*16*img1.channels()*sizeof(PixType) + 32; //tempBuf //TODO: probably 6 would do instead of 16 (alignment?)
width*16*img1.channels()*sizeof(PixType) + 32; //tempBuf
Mat auxBuff;
auxBuff.create(1, (int)auxBufsSize, CV_8U);
CostType* pixDiff = (CostType*)alignPtr(auxBuff.ptr(), ALIGN);
@ -886,7 +884,7 @@ struct CalcVerticalSums: public ParallelLoopBody
// Simplification of index calculation
pixDiff -= (x1>SW2 ? (x1 - SW2): 0)*D;
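// A note on the shift above (my reading): pixDiff only covers the strip
// [x1-SW2, x2+SW2), but calcPixelCostBT fills it using global x coordinates;
// subtracting (x1-SW2)*D lets both that call and the loop below keep indexing
// pixDiff with x*D exactly as the single-threaded version does.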
for( int pass = 1; pass <= 2; pass++ ) //TODO: rename this magic 2.
for( int pass = 1; pass <= npasses; pass++ )
{
int y1, y2, dy;
@ -899,18 +897,18 @@ struct CalcVerticalSums: public ParallelLoopBody
y1 = height-1; y2 = -1; dy = -1;
}
CostType *Lr[NLR]={0}, *minLr[NLR]={0}; //4e: arrays for L(x,y,r,d) of previous and current rows and minimums of them
CostType *Lr[NLR]={0}, *minLr[NLR]={0};
for( k = 0; k < NLR; k++ ) //4e: One of them is read and one is written, so the pointers need to be swapped.
{ //4e: That swap happens at the end of the next loop, not here.
for( k = 0; k < NLR; k++ )
{
// shift Lr[k] and minLr[k] pointers, because we allocated them with the borders,
// and will occasionally use negative indices with the arrays
// we need to shift Lr[k] pointers by 1, to give the space for d=-1.
// however, then the alignment will be imperfect, i.e. bad for SSE,
// thus we shift the pointers by 8 (8*sizeof(short) == 16 - ideal alignment)
Lr[k] = hsumBuf + costBufSize*hsumBufNRows + LrSize*k + D2*LrBorder + 8;
Lr[k] = hsumBuf + costBufSize*hsumBufNRows + LrSize*k + 8;
memset( Lr[k] + x1*D2 - 8, 0, (x2-x1)*D2*sizeof(CostType) );
minLr[k] = hsumBuf + costBufSize*hsumBufNRows + LrSize*NLR + minLrSize*k + LrBorder;
minLr[k] = hsumBuf + costBufSize*hsumBufNRows + LrSize*NLR + minLrSize*k;
memset( minLr[k] + x1, 0, (x2-x1)*sizeof(CostType) );
}
@ -922,60 +920,37 @@ struct CalcVerticalSums: public ParallelLoopBody
if( pass == 1 ) // compute C on the first pass, and reuse it on the second pass, if any.
{
int dy1 = y == 0 ? 0 : y + SH2, dy2 = y == 0 ? SH2 : dy1; //4e: for the first line's block sum we need to compute half a window of cost rows, and only one row for the others
int dy1 = y == 0 ? 0 : y + SH2, dy2 = y == 0 ? SH2 : dy1;
for( k = dy1; k <= dy2; k++ )
{
CostType* hsumAdd = hsumBuf + (std::min(k, height-1) % hsumBufNRows)*costBufSize; //4e: Ring buffer for horizontally summed lines
CostType* hsumAdd = hsumBuf + (std::min(k, height-1) % hsumBufNRows)*costBufSize;
if( k < height )
{
calcPixelCostBT( img1, img2, k, minD, maxD, pixDiff, tempBuf, clipTab, TAB_OFS, ftzero, x1 - SW2, x2 + SW2);
memset(hsumAdd + x1*D, 0, D*sizeof(CostType));
for( x = (x1 - SW2)*D; x <= (x1 + SW2)*D; x += D ) //4e: Compute the summed costs over all disparities for the first pixel of the strip
for( x = (x1 - SW2)*D; x <= (x1 + SW2)*D; x += D )
{
int xbord = x <= 0 ? 0 : (x > (width1 - 1)*D? (width1 - 1)*D : x);
for( d = 0; d < D; d++ )
hsumAdd[x1*D + d] = (CostType)(hsumAdd[x1*D + d] + pixDiff[xbord + d]);
}
if( y > 0 ) //4e: We compute horizontal sums and form the full block sum for row y by adding them to the previous row's sums and subtracting the oldest stored
{ //4e: horizontal sum in hsumBuf. The exception is y=0, which needs several iterations per line to build the full block sum.
if( y > 0 )
{
const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize;
const CostType* Cprev = C - costBufSize;
// We need to compute C[x1] differently, because hsumAdd is already calculated
// We skip this for x==0, because the original function forgot to do it //TODO: Check whether that original behaviour still exists
if(x1!=0)
{
for( d = 0; d < D; d++ )
C[x1*D + d] = (CostType)(Cprev[x1*D + d] + hsumAdd[x1*D + d] - hsumSub[x1*D + d]);
}
for( d = 0; d < D; d++ )
C[x1*D + d] = (CostType)(Cprev[x1*D + d] + hsumAdd[x1*D + d] - hsumSub[x1*D + d]);
for( x = (x1+1)*D; x < x2*D; x += D )
{
const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);
const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);
// #if CV_SIMD128
// if( useSIMD )
// {
// for( d = 0; d < D; d += 8 )
// {
// v_int16x8 hv = v_load(hsumAdd + x - D + d);
// v_int16x8 Cx = v_load(Cprev + x + d);
// v_int16x8 psub = v_load(pixSub + d);
// v_int16x8 padd = v_load(pixAdd + d);
// hv = (hv - psub + padd);
// psub = v_load(hsumSub + x + d);
// Cx = Cx - psub + hv;
// v_store(hsumAdd + x + d, hv);
// v_store(C + x + d, Cx);
// }
// }
// else
// #endif
{
for( d = 0; d < D; d++ )
{
@ -987,8 +962,8 @@ struct CalcVerticalSums: public ParallelLoopBody
}
else
{
for( x = (x1+1)*D; x < x2*D; x += D ) //4e: Calculates horizontal sums when y==0. This code runs SH2+1 times and its result is then combined differently
{ //4e: to build the full block sum, which is why it is kept separate from the case above.
for( x = (x1+1)*D; x < x2*D; x += D )
{
const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);
const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);
@ -996,10 +971,9 @@ struct CalcVerticalSums: public ParallelLoopBody
hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
}
}
// Return to the coordinates expected by calcPixelCostBT
}
if( y == 0 ) //4e: Calculating first full block sum.
if( y == 0 )
{
int scale = k == 0 ? SH2 + 1 : 1;
for( x = x1*D; x < x2*D; x++ )
@ -1008,26 +982,19 @@ struct CalcVerticalSums: public ParallelLoopBody
}
// also, clear the S buffer
for( k = x1*D; k < x2*D; k++ ) //4e: only on the first pass, so on later passes S keeps the accumulated values; don't be confused
for( k = x1*D; k < x2*D; k++ )
S[k] = 0;
}
// [formula 13 in the paper]
// compute L_r(p, d) = C(p, d) +
// min(L_r(p-r, d),
// L_r(p-r, d-1) + P1,
// L_r(p-r, d+1) + P1,
// min_k L_r(p-r, k) + P2) - min_k L_r(p-r, k)
// where p = (x,y), r is one of the directions.
// we process all the directions at once:
// 0: r=(-dx, 0)
// 1: r=(-1, -dy)
// 2: r=(0, -dy)
// 3: r=(1, -dy)
// 4: r=(-2, -dy)
// 5: r=(-1, -dy*2)
// 6: r=(1, -dy*2)
// 7: r=(2, -dy)
// [formula 13 in the paper]
// compute L_r(p, d) = C(p, d) +
// min(L_r(p-r, d),
// L_r(p-r, d-1) + P1,
// L_r(p-r, d+1) + P1,
// min_k L_r(p-r, k) + P2) - min_k L_r(p-r, k)
// where p = (x,y), r is one of the directions.
// we process one direction on the first pass and the other on the second:
// r=(0, dy), where dy=1 on first pass and dy=-1 on second
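// Two details about the update below, inferred from the code rather than stated
// in the patch: delta = min_k L_r(p-r, k) + P2, so the bracketed minimum never
// exceeds delta and each L stays within [C(p,d) - P2, C(p,d)]; and because Cbuf
// is pre-filled with P2 before the sums are accumulated, the stored costs remain
// non-negative and well inside the CostType range.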
for( x = x1; x != x2; x++ )
{
@ -1043,88 +1010,12 @@ struct CalcVerticalSums: public ParallelLoopBody
const CostType* Cp = C + x*D;
CostType* Sp = S + x*D;
// #if CV_SIMD128
// if( useSIMD )
// {
// v_int16x8 _P1 = v_setall_s16((short)P1);
//
// v_int16x8 _delta0 = v_setall_s16((short)delta0);
// v_int16x8 _delta1 = v_setall_s16((short)delta1);
// v_int16x8 _delta2 = v_setall_s16((short)delta2);
// v_int16x8 _delta3 = v_setall_s16((short)delta3);
// v_int16x8 _minL0 = v_setall_s16((short)MAX_COST);
//
// for( d = 0; d < D; d += 8 )
// {
// v_int16x8 Cpd = v_load(Cp + d);
// v_int16x8 L0, L1, L2, L3;
//
// L0 = v_load(Lr_p0 + d);
// L1 = v_load(Lr_p1 + d);
// L2 = v_load(Lr_ppr + d);
// L3 = v_load(Lr_p3 + d);
//
// L0 = v_min(L0, (v_load(Lr_p0 + d - 1) + _P1));
// L0 = v_min(L0, (v_load(Lr_p0 + d + 1) + _P1));
//
// L1 = v_min(L1, (v_load(Lr_p1 + d - 1) + _P1));
// L1 = v_min(L1, (v_load(Lr_p1 + d + 1) + _P1));
//
// L2 = v_min(L2, (v_load(Lr_ppr + d - 1) + _P1));
// L2 = v_min(L2, (v_load(Lr_ppr + d + 1) + _P1));
//
// L3 = v_min(L3, (v_load(Lr_p3 + d - 1) + _P1));
// L3 = v_min(L3, (v_load(Lr_p3 + d + 1) + _P1));
//
// L0 = v_min(L0, _delta0);
// L0 = ((L0 - _delta0) + Cpd);
//
// L1 = v_min(L1, _delta1);
// L1 = ((L1 - _delta1) + Cpd);
//
// L2 = v_min(L2, _delta2);
// L2 = ((L2 - _delta2) + Cpd);
//
// L3 = v_min(L3, _delta3);
// L3 = ((L3 - _delta3) + Cpd);
//
// v_store(Lr_p + d, L0);
// v_store(Lr_p + d + D2, L1);
// v_store(Lr_p + d + D2*2, L2);
// v_store(Lr_p + d + D2*3, L3);
//
// // Get minimum from in L0-L3
// v_int16x8 t02L, t02H, t13L, t13H, t0123L, t0123H;
// v_zip(L0, L2, t02L, t02H); // L0[0] L2[0] L0[1] L2[1]...
// v_zip(L1, L3, t13L, t13H); // L1[0] L3[0] L1[1] L3[1]...
// v_int16x8 t02 = v_min(t02L, t02H); // L0[i] L2[i] L0[i] L2[i]...
// v_int16x8 t13 = v_min(t13L, t13H); // L1[i] L3[i] L1[i] L3[i]...
// v_zip(t02, t13, t0123L, t0123H); // L0[i] L1[i] L2[i] L3[i]...
// v_int16x8 t0 = v_min(t0123L, t0123H);
// _minL0 = v_min(_minL0, t0);
//
// v_int16x8 Sval = v_load(Sp + d);
//
// L0 = L0 + L1;
// L2 = L2 + L3;
// Sval = Sval + L0;
// Sval = Sval + L2;
//
// v_store(Sp + d, Sval);
// }
//
// v_int32x4 minL, minH;
// v_expand(_minL0, minL, minH);
// v_pack_store(&minLr[0][x], v_min(minL, minH));
// }
// else
// #endif
{
int minL = MAX_COST;
for( d = 0; d < D; d++ )
{
int Cpd = Cp[d], L; //4e: Remember that every Cp was increased by P2 when Cbuf was initialized; that's why the next lines match the paper
int Cpd = Cp[d], L;
L = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta;
@ -1144,7 +1035,6 @@ struct CalcVerticalSums: public ParallelLoopBody
}
}
static const int NLR = 2;
static const int LrBorder = NLR - 1;
const Mat& img1;
const Mat& img2;
CostType* Cbuf;
@ -1178,7 +1068,7 @@ struct CalcHorizontalSums: public ParallelLoopBody
minD = params.minDisparity;
maxD = minD + params.numDisparities;
P1 = params.P1 > 0 ? params.P1 : 2;
P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1); //TODO: think about P1/S(x,y) Proportion
P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);
uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10;
disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1;
height = img1.rows;
@ -1192,7 +1082,7 @@ struct CalcHorizontalSums: public ParallelLoopBody
costBufSize = width1*D;
CSBufSize = costBufSize*height;
D2 = D + 16;
LrSize = 2 * D2; //TODO: Check: do we need this value or not?
LrSize = 2 * D2;
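// Presumed rationale (not stated in the patch): the horizontal sweeps only ever
// need the previous pixel's costs, so two D2-sized slots are enough; operator()
// below ping-pongs between them with (x & 1) instead of keeping a whole row of
// Lr values the way the vertical pass does.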
Cbuf = alignedBuf;
Sbuf = Cbuf + CSBufSize;
}
@ -1200,13 +1090,11 @@ struct CalcHorizontalSums: public ParallelLoopBody
void operator()( const Range& range ) const
{
int y1 = range.start, y2 = range.end;
static const CostType MAX_COST = SHRT_MAX;
static const int ALIGN = 16;
size_t auxBufsSize = LrSize + width*(sizeof(CostType) + sizeof(DispType)) + 32;
size_t auxBufsSize = LrSize * sizeof(CostType) + width*(sizeof(CostType) + sizeof(DispType)) + 32;
Mat auxBuff;
auxBuff.create(1, (int)auxBufsSize, CV_8U);
CostType *Lr = (CostType*)alignPtr(auxBuff.ptr(), ALIGN) + 8;
CostType *Lr = ((CostType*)alignPtr(auxBuff.ptr(), ALIGN)) + 8;
CostType* disp2cost = Lr + LrSize;
DispType* disp2ptr = (DispType*)(disp2cost + width);
@ -1227,132 +1115,48 @@ struct CalcHorizontalSums: public ParallelLoopBody
// clear buffers
memset( Lr - 8, 0, LrSize*sizeof(CostType) );
Lr[-1] = Lr[D] = Lr[D2 - 1] = Lr[D2 + D] = MAX_COST;
minLr = 0;
/*
[formula 13 in the paper]
compute L_r(p, d) = C(p, d) +
min(L_r(p-r, d),
L_r(p-r, d-1) + P1,
L_r(p-r, d+1) + P1,
min_k L_r(p-r, k) + P2) - min_k L_r(p-r, k)
where p = (x,y), r is one of the directions.
we process all the directions at once:
0: r=(-dx, 0)
1: r=(-1, -dy)
2: r=(0, -dy)
3: r=(1, -dy)
4: r=(-2, -dy)
5: r=(-1, -dy*2)
6: r=(1, -dy*2)
7: r=(2, -dy)
*/
// [formula 13 in the paper]
// compute L_r(p, d) = C(p, d) +
// min(L_r(p-r, d),
// L_r(p-r, d-1) + P1,
// L_r(p-r, d+1) + P1,
// min_k L_r(p-r, k) + P2) - min_k L_r(p-r, k)
// where p = (x,y), r is one of the directions.
// we process all the directions at once:
// we process one direction on the first pass and the other on the second:
// r=(dx, 0), where dx=1 on first pass and dx=-1 on second
for( x = 0; x != width1; x++)
{
int delta = minLr + P2;
CostType* Lr_ppr = Lr + ((x&1)? 0 : D2);
Lr_ppr[-1] = Lr_ppr[D] = MAX_COST; //TODO: it would probably be better to do this once, beforehand.
CostType* Lr_p = Lr + ((x&1)? D2 :0);
const CostType* Cp = C + x*D;
CostType* Sp = S + x*D;
// #if CV_SIMD128
// if( useSIMD )
// {
// v_int16x8 _P1 = v_setall_s16((short)P1);
//
// v_int16x8 _delta0 = v_setall_s16((short)delta0);
// v_int16x8 _delta1 = v_setall_s16((short)delta1);
// v_int16x8 _delta2 = v_setall_s16((short)delta2);
// v_int16x8 _delta3 = v_setall_s16((short)delta3);
// v_int16x8 _minL0 = v_setall_s16((short)MAX_COST);
//
// for( d = 0; d < D; d += 8 )
// {
// v_int16x8 Cpd = v_load(Cp + d);
// v_int16x8 L0, L1, L2, L3;
//
// L0 = v_load(Lr_ppr + d);
// L1 = v_load(Lr_p1 + d);
// L2 = v_load(Lr_p2 + d);
// L3 = v_load(Lr_p3 + d);
//
// L0 = v_min(L0, (v_load(Lr_ppr + d - 1) + _P1));
// L0 = v_min(L0, (v_load(Lr_ppr + d + 1) + _P1));
//
// L1 = v_min(L1, (v_load(Lr_p1 + d - 1) + _P1));
// L1 = v_min(L1, (v_load(Lr_p1 + d + 1) + _P1));
//
// L2 = v_min(L2, (v_load(Lr_p2 + d - 1) + _P1));
// L2 = v_min(L2, (v_load(Lr_p2 + d + 1) + _P1));
//
// L3 = v_min(L3, (v_load(Lr_p3 + d - 1) + _P1));
// L3 = v_min(L3, (v_load(Lr_p3 + d + 1) + _P1));
//
// L0 = v_min(L0, _delta0);
// L0 = ((L0 - _delta0) + Cpd);
//
// L1 = v_min(L1, _delta1);
// L1 = ((L1 - _delta1) + Cpd);
//
// L2 = v_min(L2, _delta2);
// L2 = ((L2 - _delta2) + Cpd);
//
// L3 = v_min(L3, _delta3);
// L3 = ((L3 - _delta3) + Cpd);
//
// v_store(Lr_p + d, L0);
// v_store(Lr_p + d + D2, L1);
// v_store(Lr_p + d + D2*2, L2);
// v_store(Lr_p + d + D2*3, L3);
//
// // Get minimum from in L0-L3
// v_int16x8 t02L, t02H, t13L, t13H, t0123L, t0123H;
// v_zip(L0, L2, t02L, t02H); // L0[0] L2[0] L0[1] L2[1]...
// v_zip(L1, L3, t13L, t13H); // L1[0] L3[0] L1[1] L3[1]...
// v_int16x8 t02 = v_min(t02L, t02H); // L0[i] L2[i] L0[i] L2[i]...
// v_int16x8 t13 = v_min(t13L, t13H); // L1[i] L3[i] L1[i] L3[i]...
// v_zip(t02, t13, t0123L, t0123H); // L0[i] L1[i] L2[i] L3[i]...
// v_int16x8 t0 = v_min(t0123L, t0123H);
// _minL0 = v_min(_minL0, t0);
//
// v_int16x8 Sval = v_load(Sp + d);
//
// L0 = L0 + L1;
// L2 = L2 + L3;
// Sval = Sval + L0;
// Sval = Sval + L2;
//
// v_store(Sp + d, Sval);
// }
//
// v_int32x4 minL, minH;
// v_expand(_minL0, minL, minH);
// v_pack_store(&minLr[x], v_min(minL, minH));
// }
// else
// #endif
int minL = MAX_COST;
for( d = 0; d < D; d++ )
{
int minL = MAX_COST;
int Cpd = Cp[d], L;
for( d = 0; d < D; d++ )
{
int Cpd = Cp[d], L; //4e: Remember that every Cp was increased by P2 when Cbuf was initialized; that's why the next lines match the paper
L = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta;
L = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta;
Lr_p[d] = (CostType)L;
minL = std::min(minL, L);
Lr_p[d] = (CostType)L;
minL = std::min(minL, L);
Sp[d] = saturate_cast<CostType>(Sp[d] + L);
}
minLr = (CostType)minL;
Sp[d] = saturate_cast<CostType>(Sp[d] + L);
}
minLr = (CostType)minL;
}
memset( Lr - 8, 0, LrSize*sizeof(CostType) );
Lr[-1] = Lr[D] = Lr[D2 - 1] = Lr[D2 + D] = MAX_COST;
minLr = 0;
for( x = width1-1; x != -1; x--)
@ -1361,136 +1165,30 @@ struct CalcHorizontalSums: public ParallelLoopBody
CostType* Lr_ppr = Lr + ((x&1)? 0 :D2);
Lr_ppr[-1] = Lr_ppr[D] = MAX_COST;
CostType* Lr_p = Lr + ((x&1)? D2 :0);
const CostType* Cp = C + x*D;
CostType* Sp = S + x*D;
int minS = MAX_COST, bestDisp = -1;
// #if CV_SIMD128
// if( useSIMD )
// {
// v_int16x8 _P1 = v_setall_s16((short)P1);
//
// v_int16x8 _delta0 = v_setall_s16((short)delta0);
// v_int16x8 _delta1 = v_setall_s16((short)delta1);
// v_int16x8 _delta2 = v_setall_s16((short)delta2);
// v_int16x8 _delta3 = v_setall_s16((short)delta3);
// v_int16x8 _minL0 = v_setall_s16((short)MAX_COST);
//
// for( d = 0; d < D; d += 8 )
// {
// v_int16x8 Cpd = v_load(Cp + d);
// v_int16x8 L0, L1, L2, L3;
//
// L0 = v_load(Lr_ppr + d);
// L1 = v_load(Lr_p1 + d);
// L2 = v_load(Lr_p2 + d);
// L3 = v_load(Lr_p3 + d);
//
// L0 = v_min(L0, (v_load(Lr_ppr + d - 1) + _P1));
// L0 = v_min(L0, (v_load(Lr_ppr + d + 1) + _P1));
//
// L1 = v_min(L1, (v_load(Lr_p1 + d - 1) + _P1));
// L1 = v_min(L1, (v_load(Lr_p1 + d + 1) + _P1));
//
// L2 = v_min(L2, (v_load(Lr_p2 + d - 1) + _P1));
// L2 = v_min(L2, (v_load(Lr_p2 + d + 1) + _P1));
//
// L3 = v_min(L3, (v_load(Lr_p3 + d - 1) + _P1));
// L3 = v_min(L3, (v_load(Lr_p3 + d + 1) + _P1));
//
// L0 = v_min(L0, _delta0);
// L0 = ((L0 - _delta0) + Cpd);
//
// L1 = v_min(L1, _delta1);
// L1 = ((L1 - _delta1) + Cpd);
//
// L2 = v_min(L2, _delta2);
// L2 = ((L2 - _delta2) + Cpd);
//
// L3 = v_min(L3, _delta3);
// L3 = ((L3 - _delta3) + Cpd);
//
// v_store(Lr_p + d, L0);
// v_store(Lr_p + d + D2, L1);
// v_store(Lr_p + d + D2*2, L2);
// v_store(Lr_p + d + D2*3, L3);
//
// // Get minimum from in L0-L3
// v_int16x8 t02L, t02H, t13L, t13H, t0123L, t0123H;
// v_zip(L0, L2, t02L, t02H); // L0[0] L2[0] L0[1] L2[1]...
// v_zip(L1, L3, t13L, t13H); // L1[0] L3[0] L1[1] L3[1]...
// v_int16x8 t02 = v_min(t02L, t02H); // L0[i] L2[i] L0[i] L2[i]...
// v_int16x8 t13 = v_min(t13L, t13H); // L1[i] L3[i] L1[i] L3[i]...
// v_zip(t02, t13, t0123L, t0123H); // L0[i] L1[i] L2[i] L3[i]...
// v_int16x8 t0 = v_min(t0123L, t0123H);
// _minL0 = v_min(_minL0, t0);
//
// v_int16x8 Sval = v_load(Sp + d);
//
// L0 = L0 + L1;
// L2 = L2 + L3;
// Sval = Sval + L0;
// Sval = Sval + L2;
//
// v_store(Sp + d, Sval);
// }
//
// v_int32x4 minL, minH;
// v_expand(_minL0, minL, minH);
// v_pack_store(&minLr[x], v_min(minL, minH));
// }
// else
// #endif
//TODO: The next piece of code came from postprocessing. Be very careful when merging them!
// #if CV_SIMD128
// if( useSIMD )
// {
// v_int16x8 _minS = v_setall_s16(MAX_COST), _bestDisp = v_setall_s16(-1);
// v_int16x8 _d8 = v_int16x8(0, 1, 2, 3, 4, 5, 6, 7), _8 = v_setall_s16(8);
//
// for( d = 0; d < D; d+= 8 )
// {
// v_int16x8 L0 = v_load(Sp + d);
// v_int16x8 mask = L0 < _minS;
// _minS = v_min( L0, _minS );
// _bestDisp = _bestDisp ^ ((_bestDisp ^ _d8) & mask);
// _d8 = _d8 + _8;
// }
// v_int32x4 _d0, _d1;
// v_expand(_minS, _d0, _d1);
// minS = (int)std::min(v_reduce_min(_d0), v_reduce_min(_d1));
// v_int16x8 v_mask = v_setall_s16((short)minS) == _minS;
//
// _bestDisp = (_bestDisp & v_mask) | (v_setall_s16(SHRT_MAX) & ~v_mask);
// v_expand(_bestDisp, _d0, _d1);
// bestDisp = (int)std::min(v_reduce_min(_d0), v_reduce_min(_d1));
// }
// else
// #endif
int minL = MAX_COST;
for( d = 0; d < D; d++ )
{
int minL = MAX_COST;
int Cpd = Cp[d], L;
for( d = 0; d < D; d++ )
L = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta;
Lr_p[d] = (CostType)L;
minL = std::min(minL, L);
Sp[d] = saturate_cast<CostType>(Sp[d] + L);
if( Sp[d] < minS )
{
int Cpd = Cp[d], L; //4e: Remember that every Cp was increased by P2 when Cbuf was initialized; that's why the next lines match the paper
L = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta;
Lr_p[d] = (CostType)L;
minL = std::min(minL, L);
Sp[d] = saturate_cast<CostType>(Sp[d] + L);
if( Sp[d] < minS )
{
minS = Sp[d];
bestDisp = d;
}
minS = Sp[d];
bestDisp = d;
}
minLr = (CostType)minL;
}
minLr = (CostType)minL;
//Some postprocessing procedures and saving
for( d = 0; d < D; d++ )
{
@ -1531,17 +1229,17 @@ struct CalcHorizontalSums: public ParallelLoopBody
int _d = d1 >> DISP_SHIFT;
int d_ = (d1 + DISP_SCALE-1) >> DISP_SHIFT;
int _x = x - _d, x_ = x - d_;
if( 0 <= _x && _x < width && disp2ptr[_x] >= minD && std::abs(disp2ptr[_x] - _d) > disp12MaxDiff && //4e: To reject a disparity we must be sure there is no
0 <= x_ && x_ < width && disp2ptr[x_] >= minD && std::abs(disp2ptr[x_] - d_) > disp12MaxDiff ) //4e: way left to interpret it as correct.
if( 0 <= _x && _x < width && disp2ptr[_x] >= minD && std::abs(disp2ptr[_x] - _d) > disp12MaxDiff &&
0 <= x_ && x_ < width && disp2ptr[x_] >= minD && std::abs(disp2ptr[x_] - d_) > disp12MaxDiff )
disp1ptr[x] = (DispType)INVALID_DISP_SCALED;
}
}
}
static const int NLR = 2;
static const int LrBorder = NLR - 1;
static const int DISP_SHIFT = StereoMatcher::DISP_SHIFT;
static const int DISP_SCALE = (1 << DISP_SHIFT);
static const CostType MAX_COST = SHRT_MAX;
static const int ALIGN = 16;
const Mat& img1;
const Mat& img2;
Mat& disp1;
@ -1567,68 +1265,41 @@ struct CalcHorizontalSums: public ParallelLoopBody
int disp12MaxDiff;
};
/*
This is a new experimental version of the disparity calculation, which should be parallelized later.
TODO: Don't forget to rewrite these comments afterwards.
computes disparity for "roi" in img1 w.r.t. img2 and write it to disp1buf.
that is, disp1buf(x, y)=d means that img1(x+roi.x, y+roi.y) ~ img2(x+roi.x-d, y+roi.y).
minD <= d < maxD.
disp2full is the reverse disparity map, that is:
disp2full(x+roi.x,y+roi.y)=d means that img2(x+roi.x, y+roi.y) ~ img1(x+roi.x+d, y+roi.y)
note that disp1buf will have the same size as the roi and
disp2full will have the same size as img1 (or img2).
On exit disp2buf is not the final disparity, it is an intermediate result that becomes
On exit disp1buf is not the final disparity, it is an intermediate result that becomes
final after all the tiles are processed.
the disparity in disp1buf is written with sub-pixel accuracy
(4 fractional bits, see StereoSGBM::DISP_SCALE),
using quadratic interpolation, while the disparity in disp2buf
is written as is, without interpolation.
disp2cost also has the same size as img1 (or img2).
It contains the minimum current cost, used to find the best disparity, corresponding to the minimal cost.
*/
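/*
  Rough overview of the HH4 path below (my summary, not part of the original
  comment): the block costs C are computed once inside CalcVerticalSums, which
  also runs the two vertical DP directions r=(0,+1) and r=(0,-1) over column
  stripes in parallel; CalcHorizontalSums then runs the two horizontal directions
  r=(+1,0) and r=(-1,0) row by row, selects the best disparity from the
  accumulated S, and applies the uniqueness and left-right consistency checks.
*/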
static void computeDisparitySGBMParallel( const Mat& img1, const Mat& img2,
static void computeDisparitySGBM_HH4( const Mat& img1, const Mat& img2,
Mat& disp1, const StereoSGBMParams& params,
Mat& buffer )
{
//#if CV_SIMD128
// // maxDisparity is supposed to multiple of 16, so we can forget doing else
// static const uchar LSBTab[] =
// {
// 0, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
// 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
// 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
// 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
// 7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
// 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
// 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
// 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
// };
// static const v_uint16x8 v_LSB = v_uint16x8(0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
//
// bool useSIMD = hasSIMD128();
//#endif
const int ALIGN = 16;
const int DISP_SHIFT = StereoMatcher::DISP_SHIFT;
const int DISP_SCALE = (1 << DISP_SHIFT);
int minD = params.minDisparity, maxD = minD + params.numDisparities;
Size SADWindowSize; //4e: SAD means Sum of Absolute Differences
SADWindowSize.width = SADWindowSize.height = params.SADWindowSize > 0 ? params.SADWindowSize : 5; //4e: and this is always square
int ftzero = std::max(params.preFilterCap, 15) | 1; //4e: ftzero clips the x-derivatives. I think this array business relates to an unimplemented SIMD path
int P1 = params.P1 > 0 ? params.P1 : 2, P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1); //TODO: think about P1/S(x,y) Proportion
Size SADWindowSize;
SADWindowSize.width = SADWindowSize.height = params.SADWindowSize > 0 ? params.SADWindowSize : 5;
int ftzero = std::max(params.preFilterCap, 15) | 1;
int P1 = params.P1 > 0 ? params.P1 : 2, P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);
int k, width = disp1.cols, height = disp1.rows;
int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
int D = maxD - minD, width1 = maxX1 - minX1;
int SH2 = SADWindowSize.height/2;
int INVALID_DISP = minD - 1;
int INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2; //4e: the array is this big because the derivative can reach +-8*256 in the worst case
const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2;
PixType clipTab[TAB_SIZE];
for( k = 0; k < TAB_SIZE; k++ ) //4e: If ftzero were 4, the clipped values would run -4 -4 ... -4 -3 -2 -1 0 1 2 3 4 ... 4 4, shifted by +ftzero into 0..8 when stored
for( k = 0; k < TAB_SIZE; k++ )
clipTab[k] = (PixType)(std::min(std::max(k - TAB_OFS, -ftzero), ftzero) + ftzero);
if( minX1 >= maxX1 )
@ -1637,28 +1308,25 @@ static void computeDisparitySGBMParallel( const Mat& img1, const Mat& img2,
return;
}
CV_Assert( D % 16 == 0 ); //TODO: Are you sure? By the way, why not 8?
CV_Assert( D % 16 == 0 );
// NR - the number of directions. the loop on x below that computes Lr assumes that NR == 8.
// if you change NR, please, modify the loop as well.
int D2 = D+16; //4e: Somewhere in the code we need d+1, hence D+1. The simplest solution is to grow the D dimension by 1, which becomes 16 once the storage has to stay aligned.
int D2 = D+16;
// the number of L_r(.,.) and min_k L_r(.,.) lines in the buffer:
// for 8-way dynamic programming we need the current row and
// for dynamic programming we need the current row and
// the previous row, i.e. 2 rows in total
const int NLR = 2; //4e: We assume the linear DP needs one or more previous rows (exactly one here).
const int LrBorder = NLR - 1; //4e: to simplify indexing we keep a border for reading the previous DP row.
const int NLR = 2;
// for each possible stereo match (img1(x,y) <=> img2(x-d,y))
// we keep pixel difference cost (C) and the summary cost over NR directions (S).
// we keep pixel difference cost (C) and the summary cost over 4 directions (S).
// we also keep all the partial costs for the previous line L_r(x,d) and also min_k L_r(x, k)
size_t costBufSize = width1*D;
size_t CSBufSize = costBufSize*height;
size_t minLrSize = (width1 + LrBorder*2), LrSize = minLrSize*D2; //TODO: We don't need LrBorder for vertical passes and we don't need Lr buffer for horizontal passes.
size_t minLrSize = width1 , LrSize = minLrSize*D2;
int hsumBufNRows = SH2*2 + 2;
size_t totalBufSize = (LrSize + minLrSize)*NLR*sizeof(CostType) + // minLr[] and Lr[]
costBufSize*hsumBufNRows*sizeof(CostType) + // hsumBuf //4e: TODO: Why should the sum window height be increased one more time?
CSBufSize*2*sizeof(CostType) + 1024; // C, S //4e: C is the block sum of costs, S is the multi-directional DP sum of the same size
costBufSize*hsumBufNRows*sizeof(CostType) + // hsumBuf
CSBufSize*2*sizeof(CostType) + 1024; // C, S
if( buffer.empty() || !buffer.isContinuous() ||
buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize )
@ -1671,8 +1339,8 @@ static void computeDisparitySGBMParallel( const Mat& img1, const Mat& img2,
for(k = 0; k < (int)CSBufSize; k++ )
Cbuf[k] = (CostType)P2;
parallel_for_(Range(0,width1),CalcVerticalSums(img1, img2, params, Cbuf, clipTab));
parallel_for_(Range(0,height),CalcHorizontalSums(img1, img2, disp1, params, Cbuf));
parallel_for_(Range(0,width1),CalcVerticalSums(img1, img2, params, Cbuf, clipTab),8);
parallel_for_(Range(0,height),CalcHorizontalSums(img1, img2, disp1, params, Cbuf),8);
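// As far as I can tell, the trailing 8 passed to parallel_for_ above is the
// nstripes hint (how finely each Range is split for the scheduler), not a thread
// count; tuning it trades scheduling overhead against load balance.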
}
@ -2331,7 +1999,7 @@ public:
if(params.mode==MODE_SGBM_3WAY)
computeDisparity3WaySGBM( left, right, disp, params, buffers, num_stripes );
else if(params.mode==MODE_HH4)
computeDisparitySGBMParallel( left, right, disp, params, buffer );
computeDisparitySGBM_HH4( left, right, disp, params, buffer );
else
computeDisparitySGBM( left, right, disp, params, buffer );


@ -785,37 +785,22 @@ protected:
TEST(Calib3d_StereoBM, regression) { CV_StereoBMTest test; test.safe_run(); }
TEST(Calib3d_StereoSGBM, regression) { CV_StereoSGBMTest test; test.safe_run(); }
TEST(Calib3d_StereoSGBMPar, idontknowhowtotesthere)
TEST(Calib3d_StereoSGBM_HH4, regression)
{
// <!-- caseName, datasetName, numDisp, winSize, mode -->
// case_teddy_2 teddy "48" "3" "MODE_HH"
//Ptr<StereoSGBM> StereoSGBM::create(int minDisparity, int numDisparities, int SADWindowSize,
// int P1, int P2, int disp12MaxDiff,
// int preFilterCap, int uniquenessRatio,
// int speckleWindowSize, int speckleRange,
// int mode)
Mat leftImg = imread("/home/q/Work/GitVault/opencv_extra/testdata/cv/stereomatching/datasets/teddy/im2.png");
Mat rightImg = imread("/home/q/Work/GitVault/opencv_extra/testdata/cv/stereomatching/datasets/teddy/im6.png");
// Mat leftDisp_old, leftDisp_new;
String path = cvtest::TS::ptr()->get_data_path() + "cv/stereomatching/datasets/teddy/";
Mat leftImg = imread(path + "im2.png", 0);
Mat rightImg = imread(path + "im6.png", 0);
Mat testData = imread(path + "disp2_hh4.png",-1);
Mat leftDisp;
Mat toCheck;
{
Mat leftDisp;
Ptr<StereoSGBM> sgbm = StereoSGBM::create( 0, 48, 3, 90, 360, 1, 63, 10, 100, 32, StereoSGBM::MODE_HH);
sgbm->compute( leftImg, rightImg, leftDisp);
CV_Assert( leftDisp.type() == CV_16SC1 );
leftDisp/=8;
imwrite( "/home/q/Work/GitVault/modehh4_new.jpg", leftDisp);
}
{
Mat leftDisp;
Ptr<StereoSGBM> sgbm = StereoSGBM::create( 0, 48, 3, 90, 360, 1, 63, 10, 100, 32, StereoSGBM::MODE_HH4);
sgbm->compute( leftImg, rightImg, leftDisp);
CV_Assert( leftDisp.type() == CV_16SC1 );
leftDisp/=8;
imwrite( "/home/q/Work/GitVault/modehh4_old.jpg", leftDisp);
leftDisp.convertTo(toCheck, CV_16UC1);
// imwrite("/home/q/Work/GitVault/disp2_hh4.png",toCheck);
}
// Mat diff;
// absdiff(leftDisp_old,leftDisp_new,diff);
// CV_Assert( countNonZero(diff)==0);
//
Mat diff;
absdiff(toCheck, testData,diff);
CV_Assert( countNonZero(diff)==0);
}