mirror of
https://github.com/opencv/opencv.git
synced 2025-06-11 11:45:30 +08:00
First occurence of 4-directional version of SGBM.
Even without any tests. Next step is parallelising it.
This commit is contained in:
parent
072f873df6
commit
2918c3d75a
@ -1809,7 +1809,8 @@ public:
|
||||
{
|
||||
MODE_SGBM = 0,
|
||||
MODE_HH = 1,
|
||||
MODE_SGBM_3WAY = 2
|
||||
MODE_SGBM_3WAY = 2,
|
||||
MODE_HH4 = 1
|
||||
};
|
||||
|
||||
CV_WRAP virtual int getPreFilterCap() const = 0;
|
||||
|
@ -301,9 +301,9 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
|
||||
const CostType MAX_COST = SHRT_MAX;
|
||||
|
||||
int minD = params.minDisparity, maxD = minD + params.numDisparities;
|
||||
Size SADWindowSize;
|
||||
SADWindowSize.width = SADWindowSize.height = params.SADWindowSize > 0 ? params.SADWindowSize : 5;
|
||||
int ftzero = std::max(params.preFilterCap, 15) | 1;
|
||||
Size SADWindowSize; //4e: SAD means Sum of Absolute Differences
|
||||
SADWindowSize.width = SADWindowSize.height = params.SADWindowSize > 0 ? params.SADWindowSize : 5; //4e: and this is always square
|
||||
int ftzero = std::max(params.preFilterCap, 15) | 1; //4e:ftzero clips x-derivatives. I think, this story with arrays is about non-realized SIMD method
|
||||
int uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10;
|
||||
int disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1;
|
||||
int P1 = params.P1 > 0 ? params.P1 : 2, P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);
|
||||
@ -313,11 +313,11 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
|
||||
int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
|
||||
int SW2 = SADWindowSize.width/2, SH2 = SADWindowSize.height/2;
|
||||
bool fullDP = params.mode == StereoSGBM::MODE_HH;
|
||||
int npasses = fullDP ? 2 : 1;
|
||||
const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2;
|
||||
int npasses = fullDP ? 2 : 1; //4e: 2 passes with different behaviour for Hirshmueller method.
|
||||
const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2; //4e: array is such big due to derivative could be +-8*256 in worst cases
|
||||
PixType clipTab[TAB_SIZE];
|
||||
|
||||
for( k = 0; k < TAB_SIZE; k++ )
|
||||
for( k = 0; k < TAB_SIZE; k++ ) //4e: If ftzero would = 4, array containment will be = -4 -4 -4 ... -4 -3 -2 -1 0 1 2 3 4 ... 4 4 4
|
||||
clipTab[k] = (PixType)(std::min(std::max(k - TAB_OFS, -ftzero), ftzero) + ftzero);
|
||||
|
||||
if( minX1 >= maxX1 )
|
||||
@ -330,25 +330,25 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
|
||||
|
||||
// NR - the number of directions. the loop on x below that computes Lr assumes that NR == 8.
|
||||
// if you change NR, please, modify the loop as well.
|
||||
int D2 = D+16, NRD2 = NR2*D2;
|
||||
int D2 = D+16, NRD2 = NR2*D2; //4e: Somewhere in code we need d+1, so D+1. One of simplest solutuons is increasing D-dimension on 1. But 1 is 16, when storage should be aligned.
|
||||
|
||||
// the number of L_r(.,.) and min_k L_r(.,.) lines in the buffer:
|
||||
// for 8-way dynamic programming we need the current row and
|
||||
// the previous row, i.e. 2 rows in total
|
||||
const int NLR = 2;
|
||||
const int LrBorder = NLR - 1;
|
||||
const int NLR = 2; //4e: We assume, that we need one or more previous steps in our linear dynamic(one right here).
|
||||
const int LrBorder = NLR - 1; //4e: for simplification of calculations we need border for taking previous dynamic solutions.
|
||||
|
||||
// for each possible stereo match (img1(x,y) <=> img2(x-d,y))
|
||||
// we keep pixel difference cost (C) and the summary cost over NR directions (S).
|
||||
// we also keep all the partial costs for the previous line L_r(x,d) and also min_k L_r(x, k)
|
||||
size_t costBufSize = width1*D;
|
||||
size_t CSBufSize = costBufSize*(fullDP ? height : 1);
|
||||
size_t minLrSize = (width1 + LrBorder*2)*NR2, LrSize = minLrSize*D2;
|
||||
size_t CSBufSize = costBufSize*(fullDP ? height : 1); //4e: For HH mode it's better to keep whole array of costs.
|
||||
size_t minLrSize = (width1 + LrBorder*2)*NR2, LrSize = minLrSize*D2; //4e: TODO: Understand why NR2 per pass instead od NR2/2 (Probably, without any reason. That doesn't make code wrong)
|
||||
int hsumBufNRows = SH2*2 + 2;
|
||||
size_t totalBufSize = (LrSize + minLrSize)*NLR*sizeof(CostType) + // minLr[] and Lr[]
|
||||
costBufSize*(hsumBufNRows + 1)*sizeof(CostType) + // hsumBuf, pixdiff
|
||||
CSBufSize*2*sizeof(CostType) + // C, S
|
||||
width*16*img1.channels()*sizeof(PixType) + // temp buffer for computing per-pixel cost
|
||||
costBufSize*(hsumBufNRows + 1)*sizeof(CostType) + // hsumBuf, pixdiff //4e: TODO: Why we should increase sum window height one more time?
|
||||
CSBufSize*2*sizeof(CostType) + // C, S //4e: C is Block sum of costs, S is multidirectional dynamic sum with same size
|
||||
width*16*img1.channels()*sizeof(PixType) + // temp buffer for computing per-pixel cost //4e: It is needed for calcPixelCostBT function, as "buffer" value
|
||||
width*(sizeof(CostType) + sizeof(DispType)) + 1024; // disp2cost + disp2
|
||||
|
||||
if( buffer.empty() || !buffer.isContinuous() ||
|
||||
@ -361,7 +361,7 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
|
||||
CostType* hsumBuf = Sbuf + CSBufSize;
|
||||
CostType* pixDiff = hsumBuf + costBufSize*hsumBufNRows;
|
||||
|
||||
CostType* disp2cost = pixDiff + costBufSize + (LrSize + minLrSize)*NLR;
|
||||
CostType* disp2cost = pixDiff + costBufSize + (LrSize + minLrSize)*NLR; //4e: It is containers for backwards disparity, made by S[d] too, but with other method
|
||||
DispType* disp2ptr = (DispType*)(disp2cost + width);
|
||||
PixType* tempBuf = (PixType*)(disp2ptr + width);
|
||||
|
||||
@ -373,21 +373,21 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
|
||||
{
|
||||
int x1, y1, x2, y2, dx, dy;
|
||||
|
||||
if( pass == 1 )
|
||||
if( pass == 1 ) //4e: on the first pass, we work down, so calculate directions down, diagdown and right
|
||||
{
|
||||
y1 = 0; y2 = height; dy = 1;
|
||||
x1 = 0; x2 = width1; dx = 1;
|
||||
}
|
||||
else
|
||||
else //4e: on the second pass, we work up, so calculate directions up, diagup and up
|
||||
{
|
||||
y1 = height-1; y2 = -1; dy = -1;
|
||||
x1 = width1-1; x2 = -1; dx = -1;
|
||||
}
|
||||
|
||||
CostType *Lr[NLR]={0}, *minLr[NLR]={0};
|
||||
CostType *Lr[NLR]={0}, *minLr[NLR]={0}; //4e: arrays for L(x,y,r,d) of previous and current rows and minimums of them
|
||||
|
||||
for( k = 0; k < NLR; k++ )
|
||||
{
|
||||
for( k = 0; k < NLR; k++ ) //4e: One of them is needed, and one of them is stored. So, we need to swap pointer
|
||||
{ //4e: Yes, and this is done at the end of next cycle, not here.
|
||||
// shift Lr[k] and minLr[k] pointers, because we allocated them with the borders,
|
||||
// and will occasionally use negative indices with the arrays
|
||||
// we need to shift Lr[k] pointers by 1, to give the space for d=-1.
|
||||
@ -408,28 +408,28 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
|
||||
|
||||
if( pass == 1 ) // compute C on the first pass, and reuse it on the second pass, if any.
|
||||
{
|
||||
int dy1 = y == 0 ? 0 : y + SH2, dy2 = y == 0 ? SH2 : dy1;
|
||||
int dy1 = y == 0 ? 0 : y + SH2, dy2 = y == 0 ? SH2 : dy1; //4e: for first line's block sum we need calculate half-window of costs and only one for other
|
||||
|
||||
for( k = dy1; k <= dy2; k++ )
|
||||
{
|
||||
CostType* hsumAdd = hsumBuf + (std::min(k, height-1) % hsumBufNRows)*costBufSize;
|
||||
CostType* hsumAdd = hsumBuf + (std::min(k, height-1) % hsumBufNRows)*costBufSize; //4e: Ring buffer for horizontally summed lines
|
||||
|
||||
if( k < height )
|
||||
{
|
||||
calcPixelCostBT( img1, img2, k, minD, maxD, pixDiff, tempBuf, clipTab, TAB_OFS, ftzero );
|
||||
|
||||
memset(hsumAdd, 0, D*sizeof(CostType));
|
||||
for( x = 0; x <= SW2*D; x += D )
|
||||
for( x = 0; x <= SW2*D; x += D ) //4e: Calculation summed costs for all disparities in first pixel of line
|
||||
{
|
||||
int scale = x == 0 ? SW2 + 1 : 1;
|
||||
for( d = 0; d < D; d++ )
|
||||
hsumAdd[d] = (CostType)(hsumAdd[d] + pixDiff[x + d]*scale);
|
||||
}
|
||||
|
||||
if( y > 0 )
|
||||
{
|
||||
if( y > 0 ) //4e: We calculate horizontal sums and forming full block sums for y coord by adding this horsums to previous line's sums and subtracting stored lowest
|
||||
{ //4e: horsum in hsumBuf. Exception is case y=0, where we need many iterations per lines to create full blocking sum.
|
||||
const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize;
|
||||
const CostType* Cprev = !fullDP || y == 0 ? C : C - costBufSize;
|
||||
const CostType* Cprev = !fullDP || y == 0 ? C : C - costBufSize; //4e: Well, actually y>0, so we don't need this check: y==0
|
||||
|
||||
for( x = D; x < width1*D; x += D )
|
||||
{
|
||||
@ -465,8 +465,8 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
|
||||
}
|
||||
else
|
||||
{
|
||||
for( x = D; x < width1*D; x += D )
|
||||
{
|
||||
for( x = D; x < width1*D; x += D ) //4e: Calcluates horizontal sums if (y==0). This piece of code is calling SH2+1 times and then result is used in different way
|
||||
{ //4e: to create full blocks sum. That's why this code is isolated from upper case.
|
||||
const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);
|
||||
const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);
|
||||
|
||||
@ -476,7 +476,7 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
|
||||
}
|
||||
}
|
||||
|
||||
if( y == 0 )
|
||||
if( y == 0 ) //4e: Calculating first full block sum.
|
||||
{
|
||||
int scale = k == 0 ? SH2 + 1 : 1;
|
||||
for( x = 0; x < width1*D; x++ )
|
||||
@ -485,13 +485,13 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
|
||||
}
|
||||
|
||||
// also, clear the S buffer
|
||||
for( k = 0; k < width1*D; k++ )
|
||||
for( k = 0; k < width1*D; k++ ) //4e: only on first pass, so it keep old information, don't be confused
|
||||
S[k] = 0;
|
||||
}
|
||||
|
||||
// clear the left and the right borders
|
||||
memset( Lr[0] - NRD2*LrBorder - 8, 0, NRD2*LrBorder*sizeof(CostType) );
|
||||
memset( Lr[0] + width1*NRD2 - 8, 0, NRD2*LrBorder*sizeof(CostType) );
|
||||
memset( Lr[0] - NRD2*LrBorder - 8, 0, NRD2*LrBorder*sizeof(CostType) ); //4e: To understand this "8" shifts and how they could work it's simpler to imagine pixel dislocation in memory
|
||||
memset( Lr[0] + width1*NRD2 - 8, 0, NRD2*LrBorder*sizeof(CostType) ); //4e: ...00000000|NRD2-16 of real costs value(and some of them are zeroes too)|00000000...
|
||||
memset( minLr[0] - NR2*LrBorder, 0, NR2*LrBorder*sizeof(CostType) );
|
||||
memset( minLr[0] + width1*NR2, 0, NR2*LrBorder*sizeof(CostType) );
|
||||
|
||||
@ -613,7 +613,7 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
|
||||
|
||||
for( d = 0; d < D; d++ )
|
||||
{
|
||||
int Cpd = Cp[d], L0, L1, L2, L3;
|
||||
int Cpd = Cp[d], L0, L1, L2, L3; //4e: Remember, that every Cp is increased on P2 in line number 369. That's why next 4 lines are paper-like actually
|
||||
|
||||
L0 = Cpd + std::min((int)Lr_p0[d], std::min(Lr_p0[d-1] + P1, std::min(Lr_p0[d+1] + P1, delta0))) - delta0;
|
||||
L1 = Cpd + std::min((int)Lr_p1[d], std::min(Lr_p1[d-1] + P1, std::min(Lr_p1[d+1] + P1, delta1))) - delta1;
|
||||
@ -654,7 +654,7 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
|
||||
CostType* Sp = S + x*D;
|
||||
int minS = MAX_COST, bestDisp = -1;
|
||||
|
||||
if( npasses == 1 )
|
||||
if( npasses == 1 ) //4e: in this case we could take fifth direction almost for free(direction "left")
|
||||
{
|
||||
int xm = x*NR2, xd = xm*D2;
|
||||
|
||||
@ -804,7 +804,7 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
|
||||
disp1ptr[x + minX1] = (DispType)(d + minD*DISP_SCALE);
|
||||
}
|
||||
|
||||
for( x = minX1; x < maxX1; x++ )
|
||||
for( x = minX1; x < maxX1; x++ ) //4e: Left-right check itself
|
||||
{
|
||||
// we round the computed disparity both towards -inf and +inf and check
|
||||
// if either of the corresponding disparities in disp2 is consistent.
|
||||
@ -815,8 +815,8 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
|
||||
int _d = d1 >> DISP_SHIFT;
|
||||
int d_ = (d1 + DISP_SCALE-1) >> DISP_SHIFT;
|
||||
int _x = x - _d, x_ = x - d_;
|
||||
if( 0 <= _x && _x < width && disp2ptr[_x] >= minD && std::abs(disp2ptr[_x] - _d) > disp12MaxDiff &&
|
||||
0 <= x_ && x_ < width && disp2ptr[x_] >= minD && std::abs(disp2ptr[x_] - d_) > disp12MaxDiff )
|
||||
if( 0 <= _x && _x < width && disp2ptr[_x] >= minD && std::abs(disp2ptr[_x] - _d) > disp12MaxDiff && //4e: To dismiss disparity, we should assure, that there is no any
|
||||
0 <= x_ && x_ < width && disp2ptr[x_] >= minD && std::abs(disp2ptr[x_] - d_) > disp12MaxDiff ) //4e: chance to understand this as correct.
|
||||
disp1ptr[x] = (DispType)INVALID_DISP_SCALED;
|
||||
}
|
||||
}
|
||||
@ -828,8 +828,732 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
|
||||
}
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/*
|
||||
This is new experimential version of disparity calculation, which should be parralled after
|
||||
TODO: Don't forget to rewrire this commentaries after
|
||||
|
||||
computes disparity for "roi" in img1 w.r.t. img2 and write it to disp1buf.
|
||||
that is, disp1buf(x, y)=d means that img1(x+roi.x, y+roi.y) ~ img2(x+roi.x-d, y+roi.y).
|
||||
minD <= d < maxD.
|
||||
disp2full is the reverse disparity map, that is:
|
||||
disp2full(x+roi.x,y+roi.y)=d means that img2(x+roi.x, y+roi.y) ~ img1(x+roi.x+d, y+roi.y)
|
||||
|
||||
note that disp1buf will have the same size as the roi and
|
||||
disp2full will have the same size as img1 (or img2).
|
||||
On exit disp2buf is not the final disparity, it is an intermediate result that becomes
|
||||
final after all the tiles are processed.
|
||||
|
||||
the disparity in disp1buf is written with sub-pixel accuracy
|
||||
(4 fractional bits, see StereoSGBM::DISP_SCALE),
|
||||
using quadratic interpolation, while the disparity in disp2buf
|
||||
is written as is, without interpolation.
|
||||
|
||||
disp2cost also has the same size as img1 (or img2).
|
||||
It contains the minimum current cost, used to find the best disparity, corresponding to the minimal cost.
|
||||
*/
|
||||
static void computeDisparitySGBMParallel( const Mat& img1, const Mat& img2,
|
||||
Mat& disp1, const StereoSGBMParams& params,
|
||||
Mat& buffer )
|
||||
{
|
||||
//#if CV_SIMD128
|
||||
// // maxDisparity is supposed to multiple of 16, so we can forget doing else
|
||||
// static const uchar LSBTab[] =
|
||||
// {
|
||||
// 0, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
|
||||
// 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
|
||||
// 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
|
||||
// 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
|
||||
// 7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
|
||||
// 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
|
||||
// 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
|
||||
// 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
|
||||
// };
|
||||
// static const v_uint16x8 v_LSB = v_uint16x8(0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
|
||||
//
|
||||
// bool useSIMD = hasSIMD128();
|
||||
//#endif
|
||||
|
||||
const int ALIGN = 16;
|
||||
const int DISP_SHIFT = StereoMatcher::DISP_SHIFT;
|
||||
const int DISP_SCALE = (1 << DISP_SHIFT);
|
||||
const CostType MAX_COST = SHRT_MAX;
|
||||
|
||||
int minD = params.minDisparity, maxD = minD + params.numDisparities;
|
||||
Size SADWindowSize; //4e: SAD means Sum of Absolute Differences
|
||||
SADWindowSize.width = SADWindowSize.height = params.SADWindowSize > 0 ? params.SADWindowSize : 5; //4e: and this is always square
|
||||
int ftzero = std::max(params.preFilterCap, 15) | 1; //4e:ftzero clips x-derivatives. I think, this story with arrays is about non-realized SIMD method
|
||||
int uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10;
|
||||
int disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1;
|
||||
int P1 = params.P1 > 0 ? params.P1 : 2, P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1); //TODO: think about P1/S(x,y) Proportion
|
||||
int k, width = disp1.cols, height = disp1.rows;
|
||||
int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
|
||||
int D = maxD - minD, width1 = maxX1 - minX1;
|
||||
int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
|
||||
int SW2 = SADWindowSize.width/2, SH2 = SADWindowSize.height/2;
|
||||
const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2; //4e: array is such big due to derivative could be +-8*256 in worst cases
|
||||
PixType clipTab[TAB_SIZE];
|
||||
|
||||
for( k = 0; k < TAB_SIZE; k++ ) //4e: If ftzero would = 4, array containment will be = -4 -4 -4 ... -4 -3 -2 -1 0 1 2 3 4 ... 4 4 4
|
||||
clipTab[k] = (PixType)(std::min(std::max(k - TAB_OFS, -ftzero), ftzero) + ftzero);
|
||||
|
||||
if( minX1 >= maxX1 )
|
||||
{
|
||||
disp1 = Scalar::all(INVALID_DISP_SCALED);
|
||||
return;
|
||||
}
|
||||
|
||||
CV_Assert( D % 16 == 0 ); //TODO: Are you sure? By the way, why not 8?
|
||||
|
||||
// NR - the number of directions. the loop on x below that computes Lr assumes that NR == 8.
|
||||
// if you change NR, please, modify the loop as well.
|
||||
int D2 = D+16; //4e: Somewhere in code we need d+1, so D+1. One of simplest solutuons is increasing D-dimension on 1. But 1 is 16, when storage should be aligned.
|
||||
|
||||
// the number of L_r(.,.) and min_k L_r(.,.) lines in the buffer:
|
||||
// for 8-way dynamic programming we need the current row and
|
||||
// the previous row, i.e. 2 rows in total
|
||||
const int NLR = 2; //4e: We assume, that we need one or more previous steps in our linear dynamic(one right here).
|
||||
const int LrBorder = NLR - 1; //4e: for simplification of calculations we need border for taking previous dynamic solutions.
|
||||
|
||||
// for each possible stereo match (img1(x,y) <=> img2(x-d,y))
|
||||
// we keep pixel difference cost (C) and the summary cost over NR directions (S).
|
||||
// we also keep all the partial costs for the previous line L_r(x,d) and also min_k L_r(x, k)
|
||||
size_t costBufSize = width1*D;
|
||||
size_t CSBufSize = costBufSize*height; //4e: For HH mode it's better to keep whole array of costs.
|
||||
size_t minLrSize = (width1 + LrBorder*2), LrSize = minLrSize*D2; //4e: TODO: Understand why NR2 per pass instead od NR2/2 (Probably, without any reason. That doesn't make code wrong)
|
||||
int hsumBufNRows = SH2*2 + 2;
|
||||
size_t totalBufSize = (LrSize + minLrSize)*NLR*sizeof(CostType) + // minLr[] and Lr[]
|
||||
costBufSize*(hsumBufNRows + 1)*sizeof(CostType) + // hsumBuf, pixdiff //4e: TODO: Why we should increase sum window height one more time?
|
||||
CSBufSize*2*sizeof(CostType) + // C, S //4e: C is Block sum of costs, S is multidirectional dynamic sum with same size
|
||||
width*16*img1.channels()*sizeof(PixType) + // temp buffer for computing per-pixel cost //4e: It is needed for calcPixelCostBT function, as "buffer" value
|
||||
width*(sizeof(CostType) + sizeof(DispType)) + 1024; // disp2cost + disp2
|
||||
|
||||
if( buffer.empty() || !buffer.isContinuous() ||
|
||||
buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize )
|
||||
buffer.create(1, (int)totalBufSize, CV_8U);
|
||||
|
||||
// summary cost over different (nDirs) directions
|
||||
CostType* Cbuf = (CostType*)alignPtr(buffer.ptr(), ALIGN);
|
||||
CostType* Sbuf = Cbuf + CSBufSize;
|
||||
CostType* hsumBuf = Sbuf + CSBufSize;
|
||||
CostType* pixDiff = hsumBuf + costBufSize*hsumBufNRows;
|
||||
|
||||
CostType* disp2cost = pixDiff + costBufSize + (LrSize + minLrSize)*NLR; //4e: It is containers for backwards disparity, made by S[d] too, but with other method
|
||||
DispType* disp2ptr = (DispType*)(disp2cost + width);
|
||||
PixType* tempBuf = (PixType*)(disp2ptr + width);
|
||||
|
||||
// add P2 to every C(x,y). it saves a few operations in the inner loops
|
||||
for(k = 0; k < (int)CSBufSize; k++ )
|
||||
Cbuf[k] = (CostType)P2;
|
||||
|
||||
// Two vertical passes - up to down and down to up
|
||||
for( int pass = 1; pass <= 2; pass++ ) //TODO: rename this magic 2.
|
||||
{
|
||||
int y1, y2, dy;
|
||||
|
||||
if( pass == 1 )
|
||||
{
|
||||
y1 = 0; y2 = height; dy = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
y1 = height-1; y2 = -1; dy = -1;
|
||||
}
|
||||
|
||||
CostType *Lr[NLR]={0}, *minLr[NLR]={0}; //4e: arrays for L(x,y,r,d) of previous and current rows and minimums of them
|
||||
|
||||
for( k = 0; k < NLR; k++ ) //4e: One of them is needed, and one of them is stored. So, we need to swap pointer
|
||||
{ //4e: Yes, and this is done at the end of next cycle, not here.
|
||||
// shift Lr[k] and minLr[k] pointers, because we allocated them with the borders,
|
||||
// and will occasionally use negative indices with the arrays
|
||||
// we need to shift Lr[k] pointers by 1, to give the space for d=-1.
|
||||
// however, then the alignment will be imperfect, i.e. bad for SSE,
|
||||
// thus we shift the pointers by 8 (8*sizeof(short) == 16 - ideal alignment)
|
||||
Lr[k] = pixDiff + costBufSize + LrSize*k + D2*LrBorder + 8;
|
||||
memset( Lr[k] - LrBorder*D2 - 8, 0, LrSize*sizeof(CostType) );
|
||||
minLr[k] = pixDiff + costBufSize + LrSize*NLR + minLrSize*k + LrBorder;
|
||||
memset( minLr[k] - LrBorder, 0, minLrSize*sizeof(CostType) );
|
||||
}
|
||||
|
||||
for( int y = y1; y != y2; y += dy )
|
||||
{
|
||||
int x, d;
|
||||
CostType* C = Cbuf + y*costBufSize;
|
||||
CostType* S = Sbuf + y*costBufSize;
|
||||
|
||||
if( pass == 1 ) // compute C on the first pass, and reuse it on the second pass, if any.
|
||||
{
|
||||
int dy1 = y == 0 ? 0 : y + SH2, dy2 = y == 0 ? SH2 : dy1; //4e: for first line's block sum we need calculate half-window of costs and only one for other
|
||||
|
||||
for( k = dy1; k <= dy2; k++ )
|
||||
{
|
||||
CostType* hsumAdd = hsumBuf + (std::min(k, height-1) % hsumBufNRows)*costBufSize; //4e: Ring buffer for horizontally summed lines
|
||||
|
||||
if( k < height )
|
||||
{
|
||||
calcPixelCostBT( img1, img2, k, minD, maxD, pixDiff, tempBuf, clipTab, TAB_OFS, ftzero );
|
||||
|
||||
memset(hsumAdd, 0, D*sizeof(CostType));
|
||||
for( x = 0; x <= SW2*D; x += D ) //4e: Calculation summed costs for all disparities in first pixel of line
|
||||
{
|
||||
int scale = x == 0 ? SW2 + 1 : 1;
|
||||
for( d = 0; d < D; d++ )
|
||||
hsumAdd[d] = (CostType)(hsumAdd[d] + pixDiff[x + d]*scale);
|
||||
}
|
||||
|
||||
if( y > 0 ) //4e: We calculate horizontal sums and forming full block sums for y coord by adding this horsums to previous line's sums and subtracting stored lowest
|
||||
{ //4e: horsum in hsumBuf. Exception is case y=0, where we need many iterations per lines to create full blocking sum.
|
||||
const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize;
|
||||
const CostType* Cprev = !fullDP || y == 0 ? C : C - costBufSize; //4e: Well, actually y>0, so we don't need this check: y==0
|
||||
|
||||
for( x = D; x < width1*D; x += D )
|
||||
{
|
||||
const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);
|
||||
const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);
|
||||
|
||||
// #if CV_SIMD128
|
||||
// if( useSIMD )
|
||||
// {
|
||||
// for( d = 0; d < D; d += 8 )
|
||||
// {
|
||||
// v_int16x8 hv = v_load(hsumAdd + x - D + d);
|
||||
// v_int16x8 Cx = v_load(Cprev + x + d);
|
||||
// v_int16x8 psub = v_load(pixSub + d);
|
||||
// v_int16x8 padd = v_load(pixAdd + d);
|
||||
// hv = (hv - psub + padd);
|
||||
// psub = v_load(hsumSub + x + d);
|
||||
// Cx = Cx - psub + hv;
|
||||
// v_store(hsumAdd + x + d, hv);
|
||||
// v_store(C + x + d, Cx);
|
||||
// }
|
||||
// }
|
||||
// else
|
||||
// #endif
|
||||
{
|
||||
for( d = 0; d < D; d++ )
|
||||
{
|
||||
int hv = hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
|
||||
C[x + d] = (CostType)(Cprev[x + d] + hv - hsumSub[x + d]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for( x = D; x < width1*D; x += D ) //4e: Calcluates horizontal sums if (y==0). This piece of code is calling SH2+1 times and then result is used in different way
|
||||
{ //4e: to create full blocks sum. That's why this code is isolated from upper case.
|
||||
const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);
|
||||
const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);
|
||||
|
||||
for( d = 0; d < D; d++ )
|
||||
hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if( y == 0 ) //4e: Calculating first full block sum.
|
||||
{
|
||||
int scale = k == 0 ? SH2 + 1 : 1;
|
||||
for( x = 0; x < width1*D; x++ )
|
||||
C[x] = (CostType)(C[x] + hsumAdd[x]*scale);
|
||||
}
|
||||
}
|
||||
|
||||
// also, clear the S buffer
|
||||
for( k = 0; k < width1*D; k++ ) //4e: only on first pass, so it keep old information, don't be confused
|
||||
S[k] = 0;
|
||||
}
|
||||
|
||||
// clear the left and the right borders
|
||||
memset( Lr[0] - D2*LrBorder - 8, 0, D2*LrBorder*sizeof(CostType) ); //4e: To understand this "8" shifts and how they could work it's simpler to imagine pixel dislocation in memory
|
||||
memset( Lr[0] + width1*D2 - 8, 0, D2*LrBorder*sizeof(CostType) ); //4e: ...00000000|D2-16 of real costs value(and some of them are zeroes too)|00000000...
|
||||
memset( minLr[0] - LrBorder, 0, LrBorder*sizeof(CostType) );
|
||||
memset( minLr[0] + width1, 0, LrBorder*sizeof(CostType) );
|
||||
|
||||
/*
|
||||
[formula 13 in the paper]
|
||||
compute L_r(p, d) = C(p, d) +
|
||||
min(L_r(p-r, d),
|
||||
L_r(p-r, d-1) + P1,
|
||||
L_r(p-r, d+1) + P1,
|
||||
min_k L_r(p-r, k) + P2) - min_k L_r(p-r, k)
|
||||
where p = (x,y), r is one of the directions.
|
||||
we process all the directions at once:
|
||||
0: r=(-dx, 0)
|
||||
1: r=(-1, -dy)
|
||||
2: r=(0, -dy)
|
||||
3: r=(1, -dy)
|
||||
4: r=(-2, -dy)
|
||||
5: r=(-1, -dy*2)
|
||||
6: r=(1, -dy*2)
|
||||
7: r=(2, -dy)
|
||||
*/
|
||||
for( x = 0; x != width; x++ )
|
||||
{
|
||||
int xd = x*D2;
|
||||
|
||||
int delta = minLr[1][x];
|
||||
|
||||
CostType* Lr_ppr = Lr[1] + xd;
|
||||
|
||||
Lr_ppr[-1] = Lr_ppr[D] = MAX_COST;
|
||||
|
||||
CostType* Lr_p = Lr[0] + xd;
|
||||
const CostType* Cp = C + x*D;
|
||||
CostType* Sp = S + x*D;
|
||||
|
||||
// #if CV_SIMD128
|
||||
// if( useSIMD )
|
||||
// {
|
||||
// v_int16x8 _P1 = v_setall_s16((short)P1);
|
||||
//
|
||||
// v_int16x8 _delta0 = v_setall_s16((short)delta0);
|
||||
// v_int16x8 _delta1 = v_setall_s16((short)delta1);
|
||||
// v_int16x8 _delta2 = v_setall_s16((short)delta2);
|
||||
// v_int16x8 _delta3 = v_setall_s16((short)delta3);
|
||||
// v_int16x8 _minL0 = v_setall_s16((short)MAX_COST);
|
||||
//
|
||||
// for( d = 0; d < D; d += 8 )
|
||||
// {
|
||||
// v_int16x8 Cpd = v_load(Cp + d);
|
||||
// v_int16x8 L0, L1, L2, L3;
|
||||
//
|
||||
// L0 = v_load(Lr_p0 + d);
|
||||
// L1 = v_load(Lr_p1 + d);
|
||||
// L2 = v_load(Lr_ppr + d);
|
||||
// L3 = v_load(Lr_p3 + d);
|
||||
//
|
||||
// L0 = v_min(L0, (v_load(Lr_p0 + d - 1) + _P1));
|
||||
// L0 = v_min(L0, (v_load(Lr_p0 + d + 1) + _P1));
|
||||
//
|
||||
// L1 = v_min(L1, (v_load(Lr_p1 + d - 1) + _P1));
|
||||
// L1 = v_min(L1, (v_load(Lr_p1 + d + 1) + _P1));
|
||||
//
|
||||
// L2 = v_min(L2, (v_load(Lr_ppr + d - 1) + _P1));
|
||||
// L2 = v_min(L2, (v_load(Lr_ppr + d + 1) + _P1));
|
||||
//
|
||||
// L3 = v_min(L3, (v_load(Lr_p3 + d - 1) + _P1));
|
||||
// L3 = v_min(L3, (v_load(Lr_p3 + d + 1) + _P1));
|
||||
//
|
||||
// L0 = v_min(L0, _delta0);
|
||||
// L0 = ((L0 - _delta0) + Cpd);
|
||||
//
|
||||
// L1 = v_min(L1, _delta1);
|
||||
// L1 = ((L1 - _delta1) + Cpd);
|
||||
//
|
||||
// L2 = v_min(L2, _delta2);
|
||||
// L2 = ((L2 - _delta2) + Cpd);
|
||||
//
|
||||
// L3 = v_min(L3, _delta3);
|
||||
// L3 = ((L3 - _delta3) + Cpd);
|
||||
//
|
||||
// v_store(Lr_p + d, L0);
|
||||
// v_store(Lr_p + d + D2, L1);
|
||||
// v_store(Lr_p + d + D2*2, L2);
|
||||
// v_store(Lr_p + d + D2*3, L3);
|
||||
//
|
||||
// // Get minimum from in L0-L3
|
||||
// v_int16x8 t02L, t02H, t13L, t13H, t0123L, t0123H;
|
||||
// v_zip(L0, L2, t02L, t02H); // L0[0] L2[0] L0[1] L2[1]...
|
||||
// v_zip(L1, L3, t13L, t13H); // L1[0] L3[0] L1[1] L3[1]...
|
||||
// v_int16x8 t02 = v_min(t02L, t02H); // L0[i] L2[i] L0[i] L2[i]...
|
||||
// v_int16x8 t13 = v_min(t13L, t13H); // L1[i] L3[i] L1[i] L3[i]...
|
||||
// v_zip(t02, t13, t0123L, t0123H); // L0[i] L1[i] L2[i] L3[i]...
|
||||
// v_int16x8 t0 = v_min(t0123L, t0123H);
|
||||
// _minL0 = v_min(_minL0, t0);
|
||||
//
|
||||
// v_int16x8 Sval = v_load(Sp + d);
|
||||
//
|
||||
// L0 = L0 + L1;
|
||||
// L2 = L2 + L3;
|
||||
// Sval = Sval + L0;
|
||||
// Sval = Sval + L2;
|
||||
//
|
||||
// v_store(Sp + d, Sval);
|
||||
// }
|
||||
//
|
||||
// v_int32x4 minL, minH;
|
||||
// v_expand(_minL0, minL, minH);
|
||||
// v_pack_store(&minLr[0][x], v_min(minL, minH));
|
||||
// }
|
||||
// else
|
||||
// #endif
|
||||
{
|
||||
int minL = MAX_COST;
|
||||
|
||||
for( d = 0; d < D; d++ )
|
||||
{
|
||||
int Cpd = Cp[d], L; //4e: Remember, that every Cp is increased on P2 in line number 369. That's why next 4 lines are paper-like actually
|
||||
|
||||
L = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta;
|
||||
|
||||
Lr_p[d] = (CostType)L;
|
||||
minL0 = std::min(minL, L);
|
||||
|
||||
Sp[d] = saturate_cast<CostType>(Sp[d] + L0);
|
||||
}
|
||||
minLr[0][x] = (CostType)minL;
|
||||
}
|
||||
}
|
||||
|
||||
// now shift the cyclic buffers
|
||||
std::swap( Lr[0], Lr[1] );
|
||||
std::swap( minLr[0], minLr[1] );
|
||||
}
|
||||
}
|
||||
|
||||
// for( int pass = 1; pass <= 2; pass++ ) //pass=1 or left-to-right pass
|
||||
{
|
||||
|
||||
CostType *Lr, *minLr;
|
||||
|
||||
{ //4e: Yes, and this is done at the end of next cycle, not here.
|
||||
// shift Lr[k] and minLr[k] pointers, because we allocated them with the borders,
|
||||
// and will occasionally use negative indices with the arrays
|
||||
// we need to shift Lr[k] pointers by 1, to give the space for d=-1.
|
||||
// however, then the alignment will be imperfect, i.e. bad for SSE,
|
||||
// thus we shift the pointers by 8 (8*sizeof(short) == 16 - ideal alignment)
|
||||
Lr = pixDiff + costBufSize + D2*LrBorder + 8;
|
||||
memset( Lr - LrBorder*D2 - 8, 0, LrSize*sizeof(CostType) );
|
||||
minLr = pixDiff + costBufSize + LrSize*NLR + LrBorder;
|
||||
memset( minLr - LrBorder, 0, minLrSize*sizeof(CostType) );
|
||||
}
|
||||
|
||||
for( int y = 0; y != height; y++)
|
||||
{
|
||||
int x, d;
|
||||
DispType* disp1ptr = disp1.ptr<DispType>(y);
|
||||
CostType* C = Cbuf + costBufSize;
|
||||
CostType* S = Sbuf + costBufSize;
|
||||
|
||||
for( x = 0; x < width; x++ )
|
||||
{
|
||||
disp1ptr[x] = disp2ptr[x] = (DispType)INVALID_DISP_SCALED;
|
||||
disp2cost[x] = MAX_COST;
|
||||
}
|
||||
|
||||
// clear the left and the right borders //TODO: Well, two of that memsets we could delete, but rest of them is direction-dependent
|
||||
memset( Lr - D2*LrBorder - 8, 0, D2*LrBorder*sizeof(CostType) ); //4e: To understand this "8" shifts and how they could work it's simpler to imagine pixel dislocation in memory
|
||||
memset( Lr + width1*D2 - 8, 0, D2*LrBorder*sizeof(CostType) ); //4e: ...00000000|D2-16 of real costs value(and some of them are zeroes too)|00000000...
|
||||
memset( minLr - LrBorder, 0, LrBorder*sizeof(CostType) );
|
||||
memset( minLr + width1, 0, LrBorder*sizeof(CostType) );
|
||||
|
||||
/*
|
||||
[formula 13 in the paper]
|
||||
compute L_r(p, d) = C(p, d) +
|
||||
min(L_r(p-r, d),
|
||||
L_r(p-r, d-1) + P1,
|
||||
L_r(p-r, d+1) + P1,
|
||||
min_k L_r(p-r, k) + P2) - min_k L_r(p-r, k)
|
||||
where p = (x,y), r is one of the directions.
|
||||
we process all the directions at once:
|
||||
0: r=(-dx, 0)
|
||||
1: r=(-1, -dy)
|
||||
2: r=(0, -dy)
|
||||
3: r=(1, -dy)
|
||||
4: r=(-2, -dy)
|
||||
5: r=(-1, -dy*2)
|
||||
6: r=(1, -dy*2)
|
||||
7: r=(2, -dy)
|
||||
*/
|
||||
for( x = 0; x != width1; x++)
|
||||
{
|
||||
int xd = x*D2;
|
||||
|
||||
int delta = minLr[x - 1];
|
||||
|
||||
CostType* Lr_ppr = Lr + xd - D2;
|
||||
|
||||
Lr_ppr[-1] = Lr_ppr[D] = MAX_COST;
|
||||
|
||||
CostType* Lr_p = Lr + xd;
|
||||
const CostType* Cp = C + x*D;
|
||||
CostType* Sp = S + x*D;
|
||||
|
||||
// #if CV_SIMD128
|
||||
// if( useSIMD )
|
||||
// {
|
||||
// v_int16x8 _P1 = v_setall_s16((short)P1);
|
||||
//
|
||||
// v_int16x8 _delta0 = v_setall_s16((short)delta0);
|
||||
// v_int16x8 _delta1 = v_setall_s16((short)delta1);
|
||||
// v_int16x8 _delta2 = v_setall_s16((short)delta2);
|
||||
// v_int16x8 _delta3 = v_setall_s16((short)delta3);
|
||||
// v_int16x8 _minL0 = v_setall_s16((short)MAX_COST);
|
||||
//
|
||||
// for( d = 0; d < D; d += 8 )
|
||||
// {
|
||||
// v_int16x8 Cpd = v_load(Cp + d);
|
||||
// v_int16x8 L0, L1, L2, L3;
|
||||
//
|
||||
// L0 = v_load(Lr_ppr + d);
|
||||
// L1 = v_load(Lr_p1 + d);
|
||||
// L2 = v_load(Lr_p2 + d);
|
||||
// L3 = v_load(Lr_p3 + d);
|
||||
//
|
||||
// L0 = v_min(L0, (v_load(Lr_ppr + d - 1) + _P1));
|
||||
// L0 = v_min(L0, (v_load(Lr_ppr + d + 1) + _P1));
|
||||
//
|
||||
// L1 = v_min(L1, (v_load(Lr_p1 + d - 1) + _P1));
|
||||
// L1 = v_min(L1, (v_load(Lr_p1 + d + 1) + _P1));
|
||||
//
|
||||
// L2 = v_min(L2, (v_load(Lr_p2 + d - 1) + _P1));
|
||||
// L2 = v_min(L2, (v_load(Lr_p2 + d + 1) + _P1));
|
||||
//
|
||||
// L3 = v_min(L3, (v_load(Lr_p3 + d - 1) + _P1));
|
||||
// L3 = v_min(L3, (v_load(Lr_p3 + d + 1) + _P1));
|
||||
//
|
||||
// L0 = v_min(L0, _delta0);
|
||||
// L0 = ((L0 - _delta0) + Cpd);
|
||||
//
|
||||
// L1 = v_min(L1, _delta1);
|
||||
// L1 = ((L1 - _delta1) + Cpd);
|
||||
//
|
||||
// L2 = v_min(L2, _delta2);
|
||||
// L2 = ((L2 - _delta2) + Cpd);
|
||||
//
|
||||
// L3 = v_min(L3, _delta3);
|
||||
// L3 = ((L3 - _delta3) + Cpd);
|
||||
//
|
||||
// v_store(Lr_p + d, L0);
|
||||
// v_store(Lr_p + d + D2, L1);
|
||||
// v_store(Lr_p + d + D2*2, L2);
|
||||
// v_store(Lr_p + d + D2*3, L3);
|
||||
//
|
||||
// // Get minimum from in L0-L3
|
||||
// v_int16x8 t02L, t02H, t13L, t13H, t0123L, t0123H;
|
||||
// v_zip(L0, L2, t02L, t02H); // L0[0] L2[0] L0[1] L2[1]...
|
||||
// v_zip(L1, L3, t13L, t13H); // L1[0] L3[0] L1[1] L3[1]...
|
||||
// v_int16x8 t02 = v_min(t02L, t02H); // L0[i] L2[i] L0[i] L2[i]...
|
||||
// v_int16x8 t13 = v_min(t13L, t13H); // L1[i] L3[i] L1[i] L3[i]...
|
||||
// v_zip(t02, t13, t0123L, t0123H); // L0[i] L1[i] L2[i] L3[i]...
|
||||
// v_int16x8 t0 = v_min(t0123L, t0123H);
|
||||
// _minL0 = v_min(_minL0, t0);
|
||||
//
|
||||
// v_int16x8 Sval = v_load(Sp + d);
|
||||
//
|
||||
// L0 = L0 + L1;
|
||||
// L2 = L2 + L3;
|
||||
// Sval = Sval + L0;
|
||||
// Sval = Sval + L2;
|
||||
//
|
||||
// v_store(Sp + d, Sval);
|
||||
// }
|
||||
//
|
||||
// v_int32x4 minL, minH;
|
||||
// v_expand(_minL0, minL, minH);
|
||||
// v_pack_store(&minLr[x], v_min(minL, minH));
|
||||
// }
|
||||
// else
|
||||
// #endif
|
||||
{
|
||||
int minL = MAX_COST;
|
||||
|
||||
for( d = 0; d < D; d++ )
|
||||
{
|
||||
int Cpd = Cp[d], L; //4e: Remember, that every Cp is increased on P2 in line number 369. That's why next 4 lines are paper-like actually
|
||||
|
||||
L0 = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta;
|
||||
|
||||
Lr_p[d] = (CostType)L;
|
||||
minL0 = std::min(minL, L);
|
||||
|
||||
Sp[d] = saturate_cast<CostType>(Sp[d] + L);
|
||||
}
|
||||
minLr[x] = (CostType)minL0;
|
||||
}
|
||||
}
|
||||
|
||||
for( x = width1-1; x != -1; x--)
|
||||
{
|
||||
int xd = x*D2;
|
||||
|
||||
int delta = minLr[x + 1];
|
||||
|
||||
CostType* Lr_ppr = Lr + xd + D2;
|
||||
|
||||
Lr_ppr[-1] = Lr_ppr[D] = MAX_COST;
|
||||
|
||||
CostType* Lr_p = Lr + xd;
|
||||
const CostType* Cp = C + x*D;
|
||||
CostType* Sp = S + x*D;
|
||||
int minS = MAX_COST, bestDisp = -1;
|
||||
|
||||
// #if CV_SIMD128
|
||||
// if( useSIMD )
|
||||
// {
|
||||
// v_int16x8 _P1 = v_setall_s16((short)P1);
|
||||
//
|
||||
// v_int16x8 _delta0 = v_setall_s16((short)delta0);
|
||||
// v_int16x8 _delta1 = v_setall_s16((short)delta1);
|
||||
// v_int16x8 _delta2 = v_setall_s16((short)delta2);
|
||||
// v_int16x8 _delta3 = v_setall_s16((short)delta3);
|
||||
// v_int16x8 _minL0 = v_setall_s16((short)MAX_COST);
|
||||
//
|
||||
// for( d = 0; d < D; d += 8 )
|
||||
// {
|
||||
// v_int16x8 Cpd = v_load(Cp + d);
|
||||
// v_int16x8 L0, L1, L2, L3;
|
||||
//
|
||||
// L0 = v_load(Lr_ppr + d);
|
||||
// L1 = v_load(Lr_p1 + d);
|
||||
// L2 = v_load(Lr_p2 + d);
|
||||
// L3 = v_load(Lr_p3 + d);
|
||||
//
|
||||
// L0 = v_min(L0, (v_load(Lr_ppr + d - 1) + _P1));
|
||||
// L0 = v_min(L0, (v_load(Lr_ppr + d + 1) + _P1));
|
||||
//
|
||||
// L1 = v_min(L1, (v_load(Lr_p1 + d - 1) + _P1));
|
||||
// L1 = v_min(L1, (v_load(Lr_p1 + d + 1) + _P1));
|
||||
//
|
||||
// L2 = v_min(L2, (v_load(Lr_p2 + d - 1) + _P1));
|
||||
// L2 = v_min(L2, (v_load(Lr_p2 + d + 1) + _P1));
|
||||
//
|
||||
// L3 = v_min(L3, (v_load(Lr_p3 + d - 1) + _P1));
|
||||
// L3 = v_min(L3, (v_load(Lr_p3 + d + 1) + _P1));
|
||||
//
|
||||
// L0 = v_min(L0, _delta0);
|
||||
// L0 = ((L0 - _delta0) + Cpd);
|
||||
//
|
||||
// L1 = v_min(L1, _delta1);
|
||||
// L1 = ((L1 - _delta1) + Cpd);
|
||||
//
|
||||
// L2 = v_min(L2, _delta2);
|
||||
// L2 = ((L2 - _delta2) + Cpd);
|
||||
//
|
||||
// L3 = v_min(L3, _delta3);
|
||||
// L3 = ((L3 - _delta3) + Cpd);
|
||||
//
|
||||
// v_store(Lr_p + d, L0);
|
||||
// v_store(Lr_p + d + D2, L1);
|
||||
// v_store(Lr_p + d + D2*2, L2);
|
||||
// v_store(Lr_p + d + D2*3, L3);
|
||||
//
|
||||
// // Get minimum from in L0-L3
|
||||
// v_int16x8 t02L, t02H, t13L, t13H, t0123L, t0123H;
|
||||
// v_zip(L0, L2, t02L, t02H); // L0[0] L2[0] L0[1] L2[1]...
|
||||
// v_zip(L1, L3, t13L, t13H); // L1[0] L3[0] L1[1] L3[1]...
|
||||
// v_int16x8 t02 = v_min(t02L, t02H); // L0[i] L2[i] L0[i] L2[i]...
|
||||
// v_int16x8 t13 = v_min(t13L, t13H); // L1[i] L3[i] L1[i] L3[i]...
|
||||
// v_zip(t02, t13, t0123L, t0123H); // L0[i] L1[i] L2[i] L3[i]...
|
||||
// v_int16x8 t0 = v_min(t0123L, t0123H);
|
||||
// _minL0 = v_min(_minL0, t0);
|
||||
//
|
||||
// v_int16x8 Sval = v_load(Sp + d);
|
||||
//
|
||||
// L0 = L0 + L1;
|
||||
// L2 = L2 + L3;
|
||||
// Sval = Sval + L0;
|
||||
// Sval = Sval + L2;
|
||||
//
|
||||
// v_store(Sp + d, Sval);
|
||||
// }
|
||||
//
|
||||
// v_int32x4 minL, minH;
|
||||
// v_expand(_minL0, minL, minH);
|
||||
// v_pack_store(&minLr[x], v_min(minL, minH));
|
||||
// }
|
||||
// else
|
||||
// #endif
|
||||
//TODO:Next piece of code is came from postprocessing. Be very careful with joining them!!!
|
||||
// #if CV_SIMD128
|
||||
// if( useSIMD )
|
||||
// {
|
||||
// v_int16x8 _minS = v_setall_s16(MAX_COST), _bestDisp = v_setall_s16(-1);
|
||||
// v_int16x8 _d8 = v_int16x8(0, 1, 2, 3, 4, 5, 6, 7), _8 = v_setall_s16(8);
|
||||
//
|
||||
// for( d = 0; d < D; d+= 8 )
|
||||
// {
|
||||
// v_int16x8 L0 = v_load(Sp + d);
|
||||
// v_int16x8 mask = L0 < _minS;
|
||||
// _minS = v_min( L0, _minS );
|
||||
// _bestDisp = _bestDisp ^ ((_bestDisp ^ _d8) & mask);
|
||||
// _d8 = _d8 + _8;
|
||||
// }
|
||||
// v_int32x4 _d0, _d1;
|
||||
// v_expand(_minS, _d0, _d1);
|
||||
// minS = (int)std::min(v_reduce_min(_d0), v_reduce_min(_d1));
|
||||
// v_int16x8 v_mask = v_setall_s16((short)minS) == _minS;
|
||||
//
|
||||
// _bestDisp = (_bestDisp & v_mask) | (v_setall_s16(SHRT_MAX) & ~v_mask);
|
||||
// v_expand(_bestDisp, _d0, _d1);
|
||||
// bestDisp = (int)std::min(v_reduce_min(_d0), v_reduce_min(_d1));
|
||||
// }
|
||||
// else
|
||||
// #endif
|
||||
{
|
||||
int minL = MAX_COST;
|
||||
|
||||
for( d = 0; d < D; d++ )
|
||||
{
|
||||
int Cpd = Cp[d], L; //4e: Remember, that every Cp is increased on P2 in line number 369. That's why next 4 lines are paper-like actually
|
||||
|
||||
L0 = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta;
|
||||
|
||||
Lr_p[d] = (CostType)L;
|
||||
minL0 = std::min(minL, L);
|
||||
|
||||
Sp[d] = saturate_cast<CostType>(Sp[d] + L);
|
||||
if( Sp[d] < minS )
|
||||
{
|
||||
minS = Sp[d];
|
||||
bestDisp = d;
|
||||
}
|
||||
}
|
||||
minLr[x] = (CostType)minL0;
|
||||
}
|
||||
//Some postprocessing procedures and saving
|
||||
for( d = 0; d < D; d++ )
|
||||
{
|
||||
if( Sp[d]*(100 - uniquenessRatio) < minS*100 && std::abs(bestDisp - d) > 1 )
|
||||
break;
|
||||
}
|
||||
if( d < D )
|
||||
continue;
|
||||
d = bestDisp;
|
||||
int _x2 = x + minX1 - d - minD;
|
||||
if( disp2cost[_x2] > minS )
|
||||
{
|
||||
disp2cost[_x2] = (CostType)minS;
|
||||
disp2ptr[_x2] = (DispType)(d + minD);
|
||||
}
|
||||
|
||||
if( 0 < d && d < D-1 )
|
||||
{
|
||||
// do subpixel quadratic interpolation:
|
||||
// fit parabola into (x1=d-1, y1=Sp[d-1]), (x2=d, y2=Sp[d]), (x3=d+1, y3=Sp[d+1])
|
||||
// then find minimum of the parabola.
|
||||
int denom2 = std::max(Sp[d-1] + Sp[d+1] - 2*Sp[d], 1);
|
||||
d = d*DISP_SCALE + ((Sp[d-1] - Sp[d+1])*DISP_SCALE + denom2)/(denom2*2);
|
||||
}
|
||||
else
|
||||
d *= DISP_SCALE;
|
||||
disp1ptr[x + minX1] = (DispType)(d + minD*DISP_SCALE);
|
||||
}
|
||||
//Left-right check sanity procedure
|
||||
for( x = minX1; x < maxX1; x++ )
|
||||
{
|
||||
// we round the computed disparity both towards -inf and +inf and check
|
||||
// if either of the corresponding disparities in disp2 is consistent.
|
||||
// This is to give the computed disparity a chance to look valid if it is.
|
||||
int d1 = disp1ptr[x];
|
||||
if( d1 == INVALID_DISP_SCALED )
|
||||
continue;
|
||||
int _d = d1 >> DISP_SHIFT;
|
||||
int d_ = (d1 + DISP_SCALE-1) >> DISP_SHIFT;
|
||||
int _x = x - _d, x_ = x - d_;
|
||||
if( 0 <= _x && _x < width && disp2ptr[_x] >= minD && std::abs(disp2ptr[_x] - _d) > disp12MaxDiff && //4e: To dismiss disparity, we should assure, that there is no any
|
||||
0 <= x_ && x_ < width && disp2ptr[x_] >= minD && std::abs(disp2ptr[x_] - d_) > disp12MaxDiff ) //4e: chance to understand this as correct.
|
||||
disp1ptr[x] = (DispType)INVALID_DISP_SCALED;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
void getBufferPointers(Mat& buffer, int width, int width1, int D, int num_ch, int SH2, int P2,
|
||||
CostType*& curCostVolumeLine, CostType*& hsumBuf, CostType*& pixDiff,
|
||||
PixType*& tmpBuf, CostType*& horPassCostVolume,
|
||||
|
Loading…
Reference in New Issue
Block a user