opencv/modules/stereo/src/stereobm.cpp

//M*//////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000, Intel Corporation, all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

/****************************************************************************************\
*    Very fast SAD-based (Sum-of-Absolute-Diffrences) stereo correspondence algorithm.   *
*    Contributed by Kurt Konolige                                                        *
\****************************************************************************************/

#include "precomp.hpp"
#include <stdio.h>
#include <limits>
#include <vector>
#include "opencl_kernels_stereo.hpp"
#include "opencv2/core/hal/intrin.hpp"
#include "opencv2/core/utils/buffer_area.private.hpp"

namespace cv {

struct StereoBMParams
{
    StereoBMParams(int _numDisparities=64, int _SADWindowSize=21)
    {
        preFilterType = StereoBM::PREFILTER_XSOBEL;
        preFilterSize = 9;
        preFilterCap = 31;
        SADWindowSize = _SADWindowSize;
        minDisparity = 0;
        numDisparities = _numDisparities > 0 ? _numDisparities : 64;
        textureThreshold = 10;
        uniquenessRatio = 15;
        speckleRange = speckleWindowSize = 0;
        roi1 = roi2 = Rect(0,0,0,0);
        disp12MaxDiff = -1;
        dispType = CV_16S;
    }

    int preFilterType;
    int preFilterSize;
    int preFilterCap;
    int SADWindowSize;
    int minDisparity;
    int numDisparities;
    int textureThreshold;
    int uniquenessRatio;
    int speckleRange;
    int speckleWindowSize;
    Rect roi1, roi2;
    int disp12MaxDiff;
    int dispType;

    inline bool useShorts() const
    {
        return preFilterCap <= 31 && SADWindowSize <= 21;
    }
    inline bool useFilterSpeckles() const
    {
        return speckleRange >= 0 && speckleWindowSize > 0;
    }
    inline bool useNormPrefilter() const
    {
        return preFilterType == StereoBM::PREFILTER_NORMALIZED_RESPONSE;
    }
};

#ifdef HAVE_OPENCL
static bool ocl_prefilter_norm(InputArray _input, OutputArray _output, int winsize, int prefilterCap)
{
    ocl::Kernel k("prefilter_norm", ocl::stereo::stereobm_oclsrc, format("-D WSZ=%d", winsize));
    if(k.empty())
        return false;

    int scale_g = winsize*winsize/8, scale_s = (1024 + scale_g)/(scale_g*2);
    scale_g *= scale_s;

    UMat input = _input.getUMat(), output;
    _output.create(input.size(), input.type());
    output = _output.getUMat();

    size_t globalThreads[3] = { (size_t)input.cols, (size_t)input.rows, 1 };

    k.args(ocl::KernelArg::PtrReadOnly(input), ocl::KernelArg::PtrWriteOnly(output), input.rows, input.cols,
        prefilterCap, scale_g, scale_s);

    return k.run(2, globalThreads, NULL, false);
}
#endif

static void prefilterNorm( const Mat& src, Mat& dst, int winsize, int ftzero, int *buf )
{
    int x, y, wsz2 = winsize/2;
    int* vsum = buf + (wsz2 + 1);
    int scale_g = winsize*winsize/8, scale_s = (1024 + scale_g)/(scale_g*2);
    const int OFS = 256*5, TABSZ = OFS*2 + 256;
    uchar tab[TABSZ];
    const uchar* sptr = src.ptr();
    int srcstep = (int)src.step;
    Size size = src.size();

    scale_g *= scale_s;

    for( x = 0; x < TABSZ; x++ )
        tab[x] = (uchar)(x - OFS < -ftzero ? 0 : x - OFS > ftzero ? ftzero*2 : x - OFS + ftzero);

    for( x = 0; x < size.width; x++ )
        vsum[x] = (ushort)(sptr[x]*(wsz2 + 2));

    for( y = 1; y < wsz2; y++ )
    {
        for( x = 0; x < size.width; x++ )
            vsum[x] = (ushort)(vsum[x] + sptr[srcstep*y + x]);
    }

    for( y = 0; y < size.height; y++ )
    {
        const uchar* top = sptr + srcstep*MAX(y-wsz2-1,0);
        const uchar* bottom = sptr + srcstep*MIN(y+wsz2,size.height-1);
        const uchar* prev = sptr + srcstep*MAX(y-1,0);
        const uchar* curr = sptr + srcstep*y;
        const uchar* next = sptr + srcstep*MIN(y+1,size.height-1);
        uchar* dptr = dst.ptr<uchar>(y);

        for( x = 0; x < size.width; x++ )
            vsum[x] = (ushort)(vsum[x] + bottom[x] - top[x]);

        for( x = 0; x <= wsz2; x++ )
        {
            vsum[-x-1] = vsum[0];
            vsum[size.width+x] = vsum[size.width-1];
        }

        int sum = vsum[0]*(wsz2 + 1);
        for( x = 1; x <= wsz2; x++ )
            sum += vsum[x];

        int val = ((curr[0]*5 + curr[1] + prev[0] + next[0])*scale_g - sum*scale_s) >> 10;
        dptr[0] = tab[val + OFS];

        for( x = 1; x < size.width-1; x++ )
        {
            sum += vsum[x+wsz2] - vsum[x-wsz2-1];
            val = ((curr[x]*4 + curr[x-1] + curr[x+1] + prev[x] + next[x])*scale_g - sum*scale_s) >> 10;
            dptr[x] = tab[val + OFS];
        }

        sum += vsum[x+wsz2] - vsum[x-wsz2-1];
        val = ((curr[x]*5 + curr[x-1] + prev[x] + next[x])*scale_g - sum*scale_s) >> 10;
        dptr[x] = tab[val + OFS];
    }
}

#ifdef HAVE_OPENCL
static bool ocl_prefilter_xsobel(InputArray _input, OutputArray _output, int prefilterCap)
{
    ocl::Kernel k("prefilter_xsobel", ocl::stereo::stereobm_oclsrc);
    if(k.empty())
        return false;

    UMat input = _input.getUMat(), output;
    _output.create(input.size(), input.type());
    output = _output.getUMat();

    size_t globalThreads[3] = { (size_t)input.cols, (size_t)input.rows, 1 };

    k.args(ocl::KernelArg::PtrReadOnly(input), ocl::KernelArg::PtrWriteOnly(output), input.rows, input.cols, prefilterCap);

    return k.run(2, globalThreads, NULL, false);
}
#endif

static void
prefilterXSobel( const Mat& src, Mat& dst, int ftzero )
{
    int x, y;
    const int OFS = 256*4, TABSZ = OFS*2 + 256;
    uchar tab[TABSZ] = { 0 };
    Size size = src.size();

    for( x = 0; x < TABSZ; x++ )
        tab[x] = (uchar)(x - OFS < -ftzero ? 0 : x - OFS > ftzero ? ftzero*2 : x - OFS + ftzero);
    uchar val0 = tab[0 + OFS];

    for( y = 0; y < size.height-1; y += 2 )
    {
        const uchar* srow1 = src.ptr<uchar>(y);
        const uchar* srow0 = y > 0 ? srow1 - src.step : size.height > 1 ? srow1 + src.step : srow1;
        const uchar* srow2 = y < size.height-1 ? srow1 + src.step : size.height > 1 ? srow1 - src.step : srow1;
        const uchar* srow3 = y < size.height-2 ? srow1 + src.step*2 : srow1;
        uchar* dptr0 = dst.ptr<uchar>(y);
        uchar* dptr1 = dptr0 + dst.step;

        dptr0[0] = dptr0[size.width-1] = dptr1[0] = dptr1[size.width-1] = val0;
        x = 1;

#if CV_SIMD
        {
            v_int16 ftz = vx_setall_s16((short) ftzero);
            v_int16 ftz2 = vx_setall_s16((short)(ftzero*2));
            v_int16 z = vx_setzero_s16();

            for(; x <= (size.width - 1) - v_int16::nlanes; x += v_int16::nlanes)
            {
                v_int16 s00 = v_reinterpret_as_s16(vx_load_expand(srow0 + x + 1));
                v_int16 s01 = v_reinterpret_as_s16(vx_load_expand(srow0 + x - 1));
                v_int16 s10 = v_reinterpret_as_s16(vx_load_expand(srow1 + x + 1));
                v_int16 s11 = v_reinterpret_as_s16(vx_load_expand(srow1 + x - 1));
                v_int16 s20 = v_reinterpret_as_s16(vx_load_expand(srow2 + x + 1));
                v_int16 s21 = v_reinterpret_as_s16(vx_load_expand(srow2 + x - 1));
                v_int16 s30 = v_reinterpret_as_s16(vx_load_expand(srow3 + x + 1));
                v_int16 s31 = v_reinterpret_as_s16(vx_load_expand(srow3 + x - 1));

                v_int16 d0 = s00 - s01;
                v_int16 d1 = s10 - s11;
                v_int16 d2 = s20 - s21;
                v_int16 d3 = s30 - s31;

                v_uint16 v0 = v_reinterpret_as_u16(v_max(v_min(d0 + d1 + d1 + d2 + ftz, ftz2), z));
                v_uint16 v1 = v_reinterpret_as_u16(v_max(v_min(d1 + d2 + d2 + d3 + ftz, ftz2), z));

                v_pack_store(dptr0 + x, v0);
                v_pack_store(dptr1 + x, v1);
            }
        }
#endif

        for( ; x < size.width-1; x++ )
        {
            int d0 = srow0[x+1] - srow0[x-1], d1 = srow1[x+1] - srow1[x-1],
            d2 = srow2[x+1] - srow2[x-1], d3 = srow3[x+1] - srow3[x-1];
            int v0 = tab[d0 + d1*2 + d2 + OFS];
            int v1 = tab[d1 + d2*2 + d3 + OFS];
            dptr0[x] = (uchar)v0;
            dptr1[x] = (uchar)v1;
        }
    }

    for( ; y < size.height; y++ )
    {
        uchar* dptr = dst.ptr<uchar>(y);
        x = 0;
#if CV_SIMD
        {
            v_uint8 val0_16 = vx_setall_u8(val0);
            for(; x <= size.width-v_uint8::nlanes; x+=v_uint8::nlanes)
                v_store(dptr + x, val0_16);
        }
#endif
        for(; x < size.width; x++ )
            dptr[x] = val0;
    }
}


static const int DISPARITY_SHIFT_16S = 4;
static const int DISPARITY_SHIFT_32S = 8;

template <typename T>
struct dispShiftTemplate
{ };

template<>
struct dispShiftTemplate<short>
{
    enum { value = DISPARITY_SHIFT_16S };
};

template<>
struct dispShiftTemplate<int>
{
    enum { value = DISPARITY_SHIFT_32S };
};

template <typename T>
inline T dispDescale(int /*v1*/, int /*v2*/, int /*d*/);

template<>
inline short dispDescale(int v1, int v2, int d)
{
    return (short)((v1*256 + (d != 0 ? v2*256/d : 0) + 15) >> 4);
}

template <>
inline int dispDescale(int v1, int v2, int d)
{
    return (int)(v1*256 + (d != 0 ? v2*256/d : 0)); // no need to add 127, this will be converted to float
}


class BufferBM
{
    static const int TABSZ = 256;
public:
    std::vector<int*> sad;
    std::vector<int*> hsad;
    std::vector<int*> htext;
    std::vector<uchar*> cbuf0;
    std::vector<ushort*> sad_short;
    std::vector<ushort*> hsad_short;
    int *prefilter[2];
    uchar tab[TABSZ];
private:
    utils::BufferArea area;

public:
    BufferBM(size_t nstripes, size_t width, size_t height, const StereoBMParams& params)
        : sad(nstripes, NULL),
        hsad(nstripes, NULL),
        htext(nstripes, NULL),
        cbuf0(nstripes, NULL),
        sad_short(nstripes, NULL),
        hsad_short(nstripes, NULL),
        prefilter()
    {
        const int wsz = params.SADWindowSize;
        const int ndisp = params.numDisparities;
        const int ftzero = params.preFilterCap;
        for (size_t i = 0; i < nstripes; ++i)
        {
            // 1D: [1][  ndisp  ][1]
#if CV_SIMD
            if (params.useShorts())
                area.allocate(sad_short[i], ndisp + 2);
            else
#endif
                area.allocate(sad[i], ndisp + 2);

            // 2D: [ wsz/2 + 1 ][   height   ][ wsz/2 + 1 ] * [ ndisp ]
#if CV_SIMD
            if (params.useShorts())
                area.allocate(hsad_short[i], (height + wsz + 2) * ndisp);
            else
#endif
                area.allocate(hsad[i], (height + wsz + 2) * ndisp);

            // 1D: [ wsz/2 + 1 ][   height   ][ wsz/2 + 1 ]
            area.allocate(htext[i], (height + wsz + 2));

            // 3D: [ wsz/2 + 1 ][   height   ][ wsz/2 + 1 ] * [ ndisp ] * [ wsz/2 + 1 ][ wsz/2 + 1 ]
            area.allocate(cbuf0[i], ((height + wsz + 2) * ndisp * (wsz + 2) + 256));
        }
        if (params.useNormPrefilter())
        {
            for (size_t i = 0; i < 2; ++i)
                area.allocate(prefilter[i], width + params.preFilterSize + 2);
        }
        area.commit();

        // static table
        for (int x = 0; x < TABSZ; x++)
            tab[x] = (uchar)std::abs(x - ftzero);
    }
};

#if CV_SIMD
template <typename dType>
static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
                                            Mat& disp, Mat& cost, const StereoBMParams& state,
                                            int _dy0, int _dy1, const BufferBM & bufX, size_t bufNum )
{
    int x, y, d;
    int wsz = state.SADWindowSize, wsz2 = wsz/2;
    int dy0 = MIN(_dy0, wsz2+1), dy1 = MIN(_dy1, wsz2+1);
    int ndisp = state.numDisparities;
    int mindisp = state.minDisparity;
    int lofs = MAX(ndisp - 1 + mindisp, 0);
    int rofs = -MIN(ndisp - 1 + mindisp, 0);
    int width = left.cols, height = left.rows;
    int width1 = width - rofs - ndisp + 1;
    int textureThreshold = state.textureThreshold;
    int uniquenessRatio = state.uniquenessRatio;
    const int disp_shift = dispShiftTemplate<dType>::value;
    dType FILTERED = (dType)((mindisp - 1) << disp_shift);

    ushort *hsad, *hsad_sub;
    uchar *cbuf;
    const uchar* lptr0 = left.ptr() + lofs;
    const uchar* rptr0 = right.ptr() + rofs;
    const uchar *lptr, *lptr_sub, *rptr;
    dType* dptr = disp.ptr<dType>();
    int sstep = (int)left.step;
    int dstep = (int)(disp.step/sizeof(dptr[0]));
    int cstep = (height + dy0 + dy1)*ndisp;
    short costbuf = 0;
    int coststep = cost.data ? (int)(cost.step/sizeof(costbuf)) : 0;
    const uchar * tab = bufX.tab;
    short v_seq[v_int16::nlanes];
    for (short i = 0; i < v_int16::nlanes; ++i)
        v_seq[i] = i;

    ushort *sad = bufX.sad_short[bufNum] + 1;
    ushort *hsad0 = bufX.hsad_short[bufNum] + (wsz2 + 1) * ndisp;
    int *htext = bufX.htext[bufNum] + (wsz2 + 1);
    uchar *cbuf0 = bufX.cbuf0[bufNum] + (wsz2 + 1) * ndisp;

    // initialize buffers
    memset(sad - 1, 0, (ndisp + 2) * sizeof(sad[0]));
    memset(hsad0 - dy0 * ndisp, 0, (height + wsz + 2) * ndisp * sizeof(hsad[0]));
    memset(htext - dy0, 0, (height + wsz + 2) * sizeof(htext[0]));

    for( x = -wsz2-1; x < wsz2; x++ )
    {
        hsad = hsad0 - dy0*ndisp; cbuf = cbuf0 + (x + wsz2 + 1)*cstep - dy0*ndisp;
        lptr = lptr0 + MIN(MAX(x, -lofs), width-lofs-1) - dy0*sstep;
        rptr = rptr0 + MIN(MAX(x, -rofs), width-rofs-ndisp) - dy0*sstep;

        for( y = -dy0; y < height + dy1; y++, hsad += ndisp, cbuf += ndisp, lptr += sstep, rptr += sstep )
        {
            int lval = lptr[0];
            v_uint8 lv = vx_setall_u8((uchar)lval);
            for( d = 0; d <= ndisp - v_uint8::nlanes; d += v_uint8::nlanes )
            {
                v_uint8 diff = v_absdiff(lv, vx_load(rptr + d));
                v_store(cbuf + d, diff);
                v_store(hsad + d, vx_load(hsad + d) + v_expand_low(diff));
                v_store(hsad + d + v_uint16::nlanes, vx_load(hsad + d + v_uint16::nlanes) + v_expand_high(diff));
            }
            if( d <= ndisp - v_uint16::nlanes )
            {
                v_uint8 diff = v_absdiff(lv, vx_load_low(rptr + d));
                v_store_low(cbuf + d, diff);
                v_store(hsad + d, vx_load(hsad + d) + v_expand_low(diff));
                d += v_uint16::nlanes;
            }
            for( ; d < ndisp; d++ )
            {
                int diff = abs(lval - rptr[d]);
                cbuf[d] = (uchar)diff;
                hsad[d] += (ushort)diff;
            }
            htext[y] += tab[lval];
        }
    }

    // initialize the left and right borders of the disparity map
    for( y = 0; y < height; y++ )
    {
        for( x = 0; x < lofs; x++ )
            dptr[y*dstep + x] = FILTERED;
        for( x = lofs + width1; x < width; x++ )
            dptr[y*dstep + x] = FILTERED;
    }
    dptr += lofs;

    for( x = 0; x < width1; x++, dptr++ )
    {
        short* costptr = cost.data ? cost.ptr<short>() + lofs + x : &costbuf;
        int x0 = x - wsz2 - 1, x1 = x + wsz2;
        const uchar* cbuf_sub = cbuf0 + ((x0 + wsz2 + 1) % (wsz + 1))*cstep - dy0*ndisp;
        cbuf = cbuf0 + ((x1 + wsz2 + 1) % (wsz + 1))*cstep - dy0*ndisp;
        hsad = hsad0 - dy0*ndisp;
        lptr_sub = lptr0 + MIN(MAX(x0, -lofs), width-1-lofs) - dy0*sstep;
        lptr = lptr0 + MIN(MAX(x1, -lofs), width-1-lofs) - dy0*sstep;
        rptr = rptr0 + MIN(MAX(x1, -rofs), width-ndisp-rofs) - dy0*sstep;

        for( y = -dy0; y < height + dy1; y++, cbuf += ndisp, cbuf_sub += ndisp,
            hsad += ndisp, lptr += sstep, lptr_sub += sstep, rptr += sstep )
        {
            int lval = lptr[0];
            v_uint8 lv = vx_setall_u8((uchar)lval);
            for( d = 0; d <= ndisp - v_uint8::nlanes; d += v_uint8::nlanes )
            {
                v_uint8 diff = v_absdiff(lv, vx_load(rptr + d));
                v_int8 cbs = v_reinterpret_as_s8(vx_load(cbuf_sub + d));
                v_store(cbuf + d, diff);
                v_store(hsad + d, v_reinterpret_as_u16(v_reinterpret_as_s16(vx_load(hsad + d) + v_expand_low(diff)) - v_expand_low(cbs)));
                v_store(hsad + d + v_uint16::nlanes, v_reinterpret_as_u16(v_reinterpret_as_s16(vx_load(hsad + d + v_uint16::nlanes) + v_expand_high(diff)) - v_expand_high(cbs)));
            }
            if( d <= ndisp - v_uint16::nlanes)
            {
                v_uint8 diff = v_absdiff(lv, vx_load_low(rptr + d));
                v_store_low(cbuf + d, diff);
                v_store(hsad + d, v_reinterpret_as_u16(v_reinterpret_as_s16(vx_load(hsad + d) + v_expand_low(diff)) - vx_load_expand((schar*)cbuf_sub + d)));
                d += v_uint16::nlanes;
            }
            for( ; d < ndisp; d++ )
            {
                int diff = abs(lval - rptr[d]);
                cbuf[d] = (uchar)diff;
                hsad[d] = hsad[d] + (ushort)diff - cbuf_sub[d];
            }
            htext[y] += tab[lval] - tab[lptr_sub[0]];
        }

        // fill borders
        for( y = dy1; y <= wsz2; y++ )
            htext[height+y] = htext[height+dy1-1];
        for( y = -wsz2-1; y < -dy0; y++ )
            htext[y] = htext[-dy0];

        // initialize sums
        for( d = 0; d < ndisp; d++ )
            sad[d] = (ushort)(hsad0[d-ndisp*dy0]*(wsz2 + 2 - dy0));

        hsad = hsad0 + (1 - dy0)*ndisp;
        for( y = 1 - dy0; y < wsz2; y++, hsad += ndisp )
        {
            for( d = 0; d <= ndisp-2*v_uint16::nlanes; d += 2*v_uint16::nlanes )
            {
                v_store(sad + d, vx_load(sad + d) + vx_load(hsad + d));
                v_store(sad + d + v_uint16::nlanes, vx_load(sad + d + v_uint16::nlanes) + vx_load(hsad + d + v_uint16::nlanes));
            }
            if( d <= ndisp-v_uint16::nlanes )
            {
                v_store(sad + d, vx_load(sad + d) + vx_load(hsad + d));
                d += v_uint16::nlanes;
            }
            if( d <= ndisp-v_uint16::nlanes/2 )
            {
                v_store_low(sad + d, vx_load_low(sad + d) + vx_load_low(hsad + d));
                d += v_uint16::nlanes/2;
            }
            for( ; d < ndisp; d++ )
                sad[d] = sad[d] + hsad[d];
        }
        int tsum = 0;
        for( y = -wsz2-1; y < wsz2; y++ )
            tsum += htext[y];

        // finally, start the real processing
        for( y = 0; y < height; y++ )
        {
            int minsad = INT_MAX, mind = -1;
            hsad = hsad0 + MIN(y + wsz2, height+dy1-1)*ndisp;
            hsad_sub = hsad0 + MAX(y - wsz2 - 1, -dy0)*ndisp;
            v_int16 minsad8 = vx_setall_s16(SHRT_MAX);
            v_int16 mind8 = vx_setall_s16(0);

            for( d = 0; d <= ndisp - 2*v_int16::nlanes; d += 2*v_int16::nlanes )
            {
                v_int16 sad8 = v_reinterpret_as_s16(vx_load(hsad + d)) - v_reinterpret_as_s16(vx_load(hsad_sub + d)) + v_reinterpret_as_s16(vx_load(sad + d));
                v_store(sad + d, v_reinterpret_as_u16(sad8));
                mind8 = v_max(mind8, (minsad8 > sad8) & vx_setall_s16((short)d));
                minsad8 = v_min(minsad8, sad8);

                sad8 = v_reinterpret_as_s16(vx_load(hsad + d + v_int16::nlanes)) - v_reinterpret_as_s16(vx_load(hsad_sub + d + v_int16::nlanes)) + v_reinterpret_as_s16(vx_load(sad + d + v_int16::nlanes));
                v_store(sad + d + v_int16::nlanes, v_reinterpret_as_u16(sad8));
                mind8 = v_max(mind8, (minsad8 > sad8) & vx_setall_s16((short)d+v_int16::nlanes));
                minsad8 = v_min(minsad8, sad8);
            }
            if( d <= ndisp - v_int16::nlanes )
            {
                v_int16 sad8 = v_reinterpret_as_s16(vx_load(hsad + d)) - v_reinterpret_as_s16(vx_load(hsad_sub + d)) + v_reinterpret_as_s16(vx_load(sad + d));
                v_store(sad + d, v_reinterpret_as_u16(sad8));
                mind8 = v_max(mind8, (minsad8 > sad8) & vx_setall_s16((short)d));
                minsad8 = v_min(minsad8, sad8);
                d += v_int16::nlanes;
            }
            minsad = v_reduce_min(minsad8);
            v_int16 v_mask = (vx_setall_s16((short)minsad) == minsad8);
            mind = v_reduce_min(((mind8+vx_load(v_seq)) & v_mask) | (vx_setall_s16(SHRT_MAX) & ~v_mask));
            for( ; d < ndisp; d++ )
            {
                int sad8 = (int)(hsad[d]) - hsad_sub[d] + sad[d];
                sad[d] = (ushort)sad8;
                if(minsad > sad8)
                {
                    mind = d;
                    minsad = sad8;
                }
            }

            tsum += htext[y + wsz2] - htext[y - wsz2 - 1];
            if( tsum < textureThreshold )
            {
                dptr[y*dstep] = FILTERED;
                continue;
            }

            if( uniquenessRatio > 0 )
            {
                int thresh = minsad + (minsad * uniquenessRatio/100);
                v_int32 thresh4 = vx_setall_s32(thresh + 1);
                v_int32 d1 = vx_setall_s32(mind-1), d2 = vx_setall_s32(mind+1);
                v_int32 dd_4 = vx_setall_s32(v_int32::nlanes);
                v_int32 d4 = vx_load_expand(v_seq);

                for( d = 0; d <= ndisp - v_int16::nlanes; d += v_int16::nlanes )
                {
                    v_int32 sad4_l, sad4_h;
                    v_expand(v_reinterpret_as_s16(vx_load(sad + d)), sad4_l, sad4_h);
                    if( v_check_any((thresh4 > sad4_l) & ((d1 > d4) | (d4 > d2))) )
                        break;
                    d4 += dd_4;
                    if( v_check_any((thresh4 > sad4_h) & ((d1 > d4) | (d4 > d2))) )
                        break;
                    d4 += dd_4;
                }
                if( d <= ndisp - v_int16::nlanes )
                {
                    dptr[y*dstep] = FILTERED;
                    continue;
                }
                if( d <= ndisp - v_int32::nlanes )
                {
                    v_int32 sad4_l = vx_load_expand((short*)sad + d);
                    if (v_check_any((thresh4 > sad4_l) & ((d1 > d4) | (d4 > d2))))
                    {
                        dptr[y*dstep] = FILTERED;
                        continue;
                    }
                    d += v_int16::nlanes;
                }
                for( ; d < ndisp; d++ )
                {
                    if( (thresh + 1) > sad[d] && ((mind - 1) > d || d > (mind + 1)) )
                        break;
                }
                if( d < ndisp )
                {
                    dptr[y*dstep] = FILTERED;
                    continue;
                }
            }

            if( 0 < mind && mind < ndisp - 1 )
            {
                int p = sad[mind+1], n = sad[mind-1];
                d = p + n - 2*sad[mind] + std::abs(p - n);
                dptr[y*dstep] = dispDescale<dType>(ndisp - mind - 1 + mindisp, p-n, d);
            }
            else
                dptr[y*dstep] = dispDescale<dType>(ndisp - mind - 1 + mindisp, 0, 0);
            costptr[y*coststep] = sad[mind];
        }
    }
}
#endif

template <typename mType>
static void
findStereoCorrespondenceBM( const Mat& left, const Mat& right,
                            Mat& disp, Mat& cost, const StereoBMParams& state,
                            int _dy0, int _dy1, const BufferBM & bufX, size_t bufNum )
{

    int x, y, d;
    int wsz = state.SADWindowSize, wsz2 = wsz/2;
    int dy0 = MIN(_dy0, wsz2+1), dy1 = MIN(_dy1, wsz2+1);
    int ndisp = state.numDisparities;
    int mindisp = state.minDisparity;
    int lofs = MAX(ndisp - 1 + mindisp, 0);
    int rofs = -MIN(ndisp - 1 + mindisp, 0);
    int width = left.cols, height = left.rows;
    int width1 = width - rofs - ndisp + 1;
    int textureThreshold = state.textureThreshold;
    int uniquenessRatio = state.uniquenessRatio;
    const int disp_shift = dispShiftTemplate<mType>::value;
    mType FILTERED = (mType)((mindisp - 1) << disp_shift);

    int *hsad, *hsad_sub;
    uchar *cbuf;
    const uchar* lptr0 = left.ptr() + lofs;
    const uchar* rptr0 = right.ptr() + rofs;
    const uchar *lptr, *lptr_sub, *rptr;
    mType* dptr = disp.ptr<mType>();
    int sstep = (int)left.step;
    int dstep = (int)(disp.step/sizeof(dptr[0]));
    int cstep = (height+dy0+dy1)*ndisp;
    int costbuf = 0;
    int coststep = cost.data ? (int)(cost.step/sizeof(costbuf)) : 0;
    const uchar * tab = bufX.tab;

#if CV_SIMD
    int v_seq[v_int32::nlanes];
    for (int i = 0; i < v_int32::nlanes; ++i)
        v_seq[i] = i;
    v_int32 d0_4 = vx_load(v_seq), dd_4 = vx_setall_s32(v_int32::nlanes);
#endif

    int *sad = bufX.sad[bufNum] + 1;
    int *hsad0 = bufX.hsad[bufNum] + (wsz2 + 1) * ndisp;
    int *htext = bufX.htext[bufNum] + (wsz2 + 1);
    uchar *cbuf0 = bufX.cbuf0[bufNum] + (wsz2 + 1) * ndisp;

    // initialize buffers
    memset(sad - 1, 0, (ndisp + 2) * sizeof(sad[0]));
    memset(hsad0 - dy0 * ndisp, 0, (height + wsz + 2) * ndisp * sizeof(hsad[0]));
    memset(htext - dy0, 0, (height + wsz + 2) * sizeof(htext[0]));

    for( x = -wsz2-1; x < wsz2; x++ )
    {
        hsad = hsad0 - dy0*ndisp; cbuf = cbuf0 + (x + wsz2 + 1)*cstep - dy0*ndisp;
        lptr = lptr0 + std::min(std::max(x, -lofs), width-lofs-1) - dy0*sstep;
        rptr = rptr0 + std::min(std::max(x, -rofs), width-rofs-ndisp) - dy0*sstep;
        for( y = -dy0; y < height + dy1; y++, hsad += ndisp, cbuf += ndisp, lptr += sstep, rptr += sstep )
        {
            int lval = lptr[0];
            d = 0;
#if CV_SIMD
            {
                v_uint8 lv = vx_setall_u8((uchar)lval);

                for( ; d <= ndisp - v_uint8::nlanes; d += v_uint8::nlanes )
                {
                    v_uint8 rv = vx_load(rptr + d);
                    v_int32 hsad_0 = vx_load(hsad + d);
                    v_int32 hsad_1 = vx_load(hsad + d + v_int32::nlanes);
                    v_int32 hsad_2 = vx_load(hsad + d + 2*v_int32::nlanes);
                    v_int32 hsad_3 = vx_load(hsad + d + 3*v_int32::nlanes);
                    v_uint8 diff = v_absdiff(lv, rv);
                    v_store(cbuf + d, diff);

                    v_uint16 diff0, diff1;
                    v_uint32 diff00, diff01, diff10, diff11;
                    v_expand(diff, diff0, diff1);
                    v_expand(diff0, diff00, diff01);
                    v_expand(diff1, diff10, diff11);

                    hsad_0 += v_reinterpret_as_s32(diff00);
                    hsad_1 += v_reinterpret_as_s32(diff01);
                    hsad_2 += v_reinterpret_as_s32(diff10);
                    hsad_3 += v_reinterpret_as_s32(diff11);

                    v_store(hsad + d, hsad_0);
                    v_store(hsad + d + v_int32::nlanes, hsad_1);
                    v_store(hsad + d + 2*v_int32::nlanes, hsad_2);
                    v_store(hsad + d + 3*v_int32::nlanes, hsad_3);
                }
            }
#endif
            for( ; d < ndisp; d++ )
            {
                int diff = std::abs(lval - rptr[d]);
                cbuf[d] = (uchar)diff;
                hsad[d] = (int)(hsad[d] + diff);
            }
            htext[y] += tab[lval];
        }
    }

    // initialize the left and right borders of the disparity map
    for( y = 0; y < height; y++ )
    {
        for( x = 0; x < lofs; x++ )
            dptr[y*dstep + x] = FILTERED;
        for( x = lofs + width1; x < width; x++ )
            dptr[y*dstep + x] = FILTERED;
    }
    dptr += lofs;

    for( x = 0; x < width1; x++, dptr++ )
    {
        int* costptr = cost.data ? cost.ptr<int>() + lofs + x : &costbuf;
        int x0 = x - wsz2 - 1, x1 = x + wsz2;
        const uchar* cbuf_sub = cbuf0 + ((x0 + wsz2 + 1) % (wsz + 1))*cstep - dy0*ndisp;
        cbuf = cbuf0 + ((x1 + wsz2 + 1) % (wsz + 1))*cstep - dy0*ndisp;
        hsad = hsad0 - dy0*ndisp;
        lptr_sub = lptr0 + MIN(MAX(x0, -lofs), width-1-lofs) - dy0*sstep;
        lptr = lptr0 + MIN(MAX(x1, -lofs), width-1-lofs) - dy0*sstep;
        rptr = rptr0 + MIN(MAX(x1, -rofs), width-ndisp-rofs) - dy0*sstep;

        for( y = -dy0; y < height + dy1; y++, cbuf += ndisp, cbuf_sub += ndisp,
            hsad += ndisp, lptr += sstep, lptr_sub += sstep, rptr += sstep )
        {
            int lval = lptr[0];
            d = 0;
#if CV_SIMD
            {
                v_uint8 lv = vx_setall_u8((uchar)lval);
                for( ; d <= ndisp - v_uint8::nlanes; d += v_uint8::nlanes )
                {
                    v_uint8 rv = vx_load(rptr + d);
                    v_int32 hsad_0 = vx_load(hsad + d);
                    v_int32 hsad_1 = vx_load(hsad + d + v_int32::nlanes);
                    v_int32 hsad_2 = vx_load(hsad + d + 2*v_int32::nlanes);
                    v_int32 hsad_3 = vx_load(hsad + d + 3*v_int32::nlanes);
                    v_uint8 cbs = vx_load(cbuf_sub + d);
                    v_uint8 diff = v_absdiff(lv, rv);
                    v_store(cbuf + d, diff);

                    v_uint16 diff0, diff1, cbs0, cbs1;
                    v_int32 diff00, diff01, diff10, diff11, cbs00, cbs01, cbs10, cbs11;
                    v_expand(diff, diff0, diff1);
                    v_expand(cbs, cbs0, cbs1);
                    v_expand(v_reinterpret_as_s16(diff0), diff00, diff01);
                    v_expand(v_reinterpret_as_s16(diff1), diff10, diff11);
                    v_expand(v_reinterpret_as_s16(cbs0), cbs00, cbs01);
                    v_expand(v_reinterpret_as_s16(cbs1), cbs10, cbs11);

                    v_int32 diff_0 = diff00 - cbs00;
                    v_int32 diff_1 = diff01 - cbs01;
                    v_int32 diff_2 = diff10 - cbs10;
                    v_int32 diff_3 = diff11 - cbs11;
                    hsad_0 += diff_0;
                    hsad_1 += diff_1;
                    hsad_2 += diff_2;
                    hsad_3 += diff_3;

                    v_store(hsad + d, hsad_0);
                    v_store(hsad + d + v_int32::nlanes, hsad_1);
                    v_store(hsad + d + 2*v_int32::nlanes, hsad_2);
                    v_store(hsad + d + 3*v_int32::nlanes, hsad_3);
                }
            }
#endif
            for( ; d < ndisp; d++ )
            {
                int diff = std::abs(lval - rptr[d]);
                cbuf[d] = (uchar)diff;
                hsad[d] = hsad[d] + diff - cbuf_sub[d];
            }
            htext[y] += tab[lval] - tab[lptr_sub[0]];
        }

        // fill borders
        for( y = dy1; y <= wsz2; y++ )
            htext[height+y] = htext[height+dy1-1];
        for( y = -wsz2-1; y < -dy0; y++ )
            htext[y] = htext[-dy0];

        // initialize sums
        for( d = 0; d < ndisp; d++ )
            sad[d] = (int)(hsad0[d-ndisp*dy0]*(wsz2 + 2 - dy0));

        hsad = hsad0 + (1 - dy0)*ndisp;
        for( y = 1 - dy0; y < wsz2; y++, hsad += ndisp )
        {
            d = 0;
#if CV_SIMD
            {
                for( d = 0; d <= ndisp-2*v_int32::nlanes; d += 2*v_int32::nlanes )
                {
                    v_int32 s0 = vx_load(sad + d);
                    v_int32 s1 = vx_load(sad + d + v_int32::nlanes);
                    v_int32 t0 = vx_load(hsad + d);
                    v_int32 t1 = vx_load(hsad + d + v_int32::nlanes);
                    s0 += t0;
                    s1 += t1;
                    v_store(sad + d, s0);
                    v_store(sad + d + v_int32::nlanes, s1);
                }
            }
#endif
            for( ; d < ndisp; d++ )
                sad[d] = (int)(sad[d] + hsad[d]);
        }
        int tsum = 0;
        for( y = -wsz2-1; y < wsz2; y++ )
            tsum += htext[y];

        // finally, start the real processing
        for( y = 0; y < height; y++ )
        {
            int minsad = INT_MAX, mind = -1;
            hsad = hsad0 + MIN(y + wsz2, height+dy1-1)*ndisp;
            hsad_sub = hsad0 + MAX(y - wsz2 - 1, -dy0)*ndisp;
            d = 0;
#if CV_SIMD
            {
                v_int32 minsad4 = vx_setall_s32(INT_MAX);
                v_int32 mind4 = vx_setall_s32(0), d4 = d0_4;

                for( ; d <= ndisp - 2*v_int32::nlanes; d += 2*v_int32::nlanes )
                {
                    v_int32 sad4 = vx_load(sad + d) + vx_load(hsad + d) - vx_load(hsad_sub + d);
                    v_store(sad + d, sad4);
                    mind4 = v_select(minsad4 > sad4, d4, mind4);
                    minsad4 = v_min(minsad4, sad4);
                    d4 += dd_4;

                    sad4 = vx_load(sad + d + v_int32::nlanes) + vx_load(hsad + d + v_int32::nlanes) - vx_load(hsad_sub + d + v_int32::nlanes);
                    v_store(sad + d + v_int32::nlanes, sad4);
                    mind4 = v_select(minsad4 > sad4, d4, mind4);
                    minsad4 = v_min(minsad4, sad4);
                    d4 += dd_4;
                }

                int CV_DECL_ALIGNED(CV_SIMD_WIDTH) minsad_buf[v_int32::nlanes], mind_buf[v_int32::nlanes];
                v_store(minsad_buf, minsad4);
                v_store(mind_buf, mind4);
                for (int i = 0; i < v_int32::nlanes; ++i)
                    if(minsad_buf[i] < minsad || (minsad == minsad_buf[i] && mind_buf[i] < mind)) { minsad = minsad_buf[i]; mind = mind_buf[i]; }
            }
#endif
            for( ; d < ndisp; d++ )
            {
                int currsad = sad[d] + hsad[d] - hsad_sub[d];
                sad[d] = currsad;
                if( currsad < minsad )
                {
                    minsad = currsad;
                    mind = d;
                }
            }

            tsum += htext[y + wsz2] - htext[y - wsz2 - 1];
            if( tsum < textureThreshold )
            {
                dptr[y*dstep] = FILTERED;
                continue;
            }

            if( uniquenessRatio > 0 )
            {
                int thresh = minsad + (minsad * uniquenessRatio/100);
                for( d = 0; d < ndisp; d++ )
                {
                    if( (d < mind-1 || d > mind+1) && sad[d] <= thresh)
                        break;
                }
                if( d < ndisp )
                {
                    dptr[y*dstep] = FILTERED;
                    continue;
                }
            }

            {
                sad[-1] = sad[1];
                sad[ndisp] = sad[ndisp-2];
                int p = sad[mind+1], n = sad[mind-1];
                d = p + n - 2*sad[mind] + std::abs(p - n);
                dptr[y*dstep] = dispDescale<mType>(ndisp - mind - 1 + mindisp, p-n, d);

                costptr[y*coststep] = sad[mind];
            }
        }
    }
}

#ifdef HAVE_OPENCL
static bool ocl_prefiltering(InputArray left0, InputArray right0, OutputArray left, OutputArray right, StereoBMParams* state)
{
    if (state->useNormPrefilter())
    {
        if(!ocl_prefilter_norm( left0, left, state->preFilterSize, state->preFilterCap))
            return false;
        if(!ocl_prefilter_norm( right0, right, state->preFilterSize, state->preFilterCap))
            return false;
    }
    else
    {
        if(!ocl_prefilter_xsobel( left0, left, state->preFilterCap ))
            return false;
        if(!ocl_prefilter_xsobel( right0, right, state->preFilterCap))
            return false;
    }
    return true;
}
#endif

struct PrefilterInvoker : public ParallelLoopBody
{
    PrefilterInvoker(const Mat& left0, const Mat& right0, Mat& left, Mat& right,
                     const BufferBM &bufX_, const StereoBMParams &state_)
        : bufX(bufX_), state(state_)
    {
        imgs0[0] = &left0; imgs0[1] = &right0;
        imgs[0] = &left; imgs[1] = &right;
    }

    void operator()(const Range& range) const CV_OVERRIDE
    {
        for( int i = range.start; i < range.end; i++ )
        {
            if (state.useNormPrefilter())
                prefilterNorm( *imgs0[i], *imgs[i], state.preFilterSize, state.preFilterCap, bufX.prefilter[i] );
            else
                prefilterXSobel( *imgs0[i], *imgs[i], state.preFilterCap );
        }
    }

    const Mat* imgs0[2];
    Mat* imgs[2];
    const BufferBM &bufX;
    const StereoBMParams &state;
};

#ifdef HAVE_OPENCL
static bool ocl_stereobm( InputArray _left, InputArray _right,
                       OutputArray _disp, StereoBMParams* state)
{
    int ndisp = state->numDisparities;
    int mindisp = state->minDisparity;
    int wsz = state->SADWindowSize;
    int wsz2 = wsz/2;

    ocl::Device devDef = ocl::Device::getDefault();
    int sizeX = devDef.isIntel() ? 32 : std::max(11, 27 - devDef.maxComputeUnits()),
        sizeY = sizeX - 1,
        N = ndisp * 2;

    cv::String opt = cv::format("-D DEFINE_KERNEL_STEREOBM -D MIN_DISP=%d -D NUM_DISP=%d"
                                " -D BLOCK_SIZE_X=%d -D BLOCK_SIZE_Y=%d -D WSZ=%d",
                                mindisp, ndisp,
                                sizeX, sizeY, wsz);
    ocl::Kernel k("stereoBM", ocl::stereo::stereobm_oclsrc, opt);
    if(k.empty())
        return false;

    UMat left = _left.getUMat(), right = _right.getUMat();
    int cols = left.cols, rows = left.rows;

    _disp.create(_left.size(), CV_16S);
    _disp.setTo((mindisp - 1) << 4);
    Rect roi = Rect(Point(wsz2 + mindisp + ndisp - 1, wsz2), Point(cols-wsz2-mindisp, rows-wsz2) );
    UMat disp = (_disp.getUMat())(roi);

    int globalX = (disp.cols + sizeX - 1) / sizeX,
        globalY = (disp.rows + sizeY - 1) / sizeY;
    size_t globalThreads[3] = {(size_t)N, (size_t)globalX, (size_t)globalY};
    size_t localThreads[3]  = {(size_t)N, 1, 1};

    int idx = 0;
    idx = k.set(idx, ocl::KernelArg::PtrReadOnly(left));
    idx = k.set(idx, ocl::KernelArg::PtrReadOnly(right));
    idx = k.set(idx, ocl::KernelArg::WriteOnlyNoSize(disp));
    idx = k.set(idx, rows);
    idx = k.set(idx, cols);
    idx = k.set(idx, state->textureThreshold);
    idx = k.set(idx, state->uniquenessRatio);
    return k.run(3, globalThreads, localThreads, false);
}
#endif

struct FindStereoCorrespInvoker : public ParallelLoopBody
{
    FindStereoCorrespInvoker( const Mat& _left, const Mat& _right,
                             Mat& _disp, const StereoBMParams &_state,
                             int _nstripes,
                             Rect _validDisparityRect,
                             Mat& _cost, const BufferBM & buf_ )
        : state(_state), buf(buf_)
    {
        CV_Assert( _disp.type() == CV_16S || _disp.type() == CV_32S );
        left = &_left; right = &_right;
        disp = &_disp;
        nstripes = _nstripes;
        validDisparityRect = _validDisparityRect;
        cost = &_cost;
    }

    void operator()(const Range& range) const CV_OVERRIDE
    {
        int cols = left->cols, rows = left->rows;
        int _row0 = std::min(cvRound(range.start * rows / nstripes), rows);
        int _row1 = std::min(cvRound(range.end * rows / nstripes), rows);

        int dispShift = disp->type() == CV_16S ? DISPARITY_SHIFT_16S :
                                                 DISPARITY_SHIFT_32S;
        int FILTERED = (state.minDisparity - 1) << dispShift;

        Rect roi = validDisparityRect & Rect(0, _row0, cols, _row1 - _row0);
        if( roi.height == 0 )
            return;
        int row0 = roi.y;
        int row1 = roi.y + roi.height;

        Mat part;
        if( row0 > _row0 )
        {
            part = disp->rowRange(_row0, row0);
            part = Scalar::all(FILTERED);
        }
        if( _row1 > row1 )
        {
            part = disp->rowRange(row1, _row1);
            part = Scalar::all(FILTERED);
        }

        Mat left_i = left->rowRange(row0, row1);
        Mat right_i = right->rowRange(row0, row1);
        Mat disp_i = disp->rowRange(row0, row1);
        Mat cost_i = state.disp12MaxDiff >= 0 ? cost->rowRange(row0, row1) : Mat();

#if CV_SIMD
        if (state.useShorts())
        {
            if( disp_i.type() == CV_16S)
                findStereoCorrespondenceBM_SIMD<short>( left_i, right_i, disp_i, cost_i, state, row0, rows - row1, buf, range.start );
            else
                findStereoCorrespondenceBM_SIMD<int>( left_i, right_i, disp_i, cost_i, state, row0, rows - row1, buf, range.start);
        }
        else
#endif
        {
            if( disp_i.type() == CV_16S )
                findStereoCorrespondenceBM<short>( left_i, right_i, disp_i, cost_i, state, row0, rows - row1, buf, range.start );
            else
                findStereoCorrespondenceBM<int>( left_i, right_i, disp_i, cost_i, state, row0, rows - row1, buf, range.start );
        }

        if( state.disp12MaxDiff >= 0 )
            validateDisparity( disp_i, cost_i, state.minDisparity, state.numDisparities, state.disp12MaxDiff );

        if( roi.x > 0 )
        {
            part = disp_i.colRange(0, roi.x);
            part = Scalar::all(FILTERED);
        }
        if( roi.x + roi.width < cols )
        {
            part = disp_i.colRange(roi.x + roi.width, cols);
            part = Scalar::all(FILTERED);
        }
    }

protected:
    const Mat *left, *right;
    Mat* disp, *cost;
    const StereoBMParams &state;

    int nstripes;
    Rect validDisparityRect;
    const BufferBM & buf;
};

class StereoBMImpl CV_FINAL : public StereoBM
{
public:
    StereoBMImpl()
        : params()
    {
        // nothing
    }

    StereoBMImpl( int _numDisparities, int _SADWindowSize )
        : params(_numDisparities, _SADWindowSize)
    {
        // nothing
    }

    void compute( InputArray leftarr, InputArray rightarr, OutputArray disparr ) CV_OVERRIDE
    {
        CV_INSTRUMENT_REGION();

        int dtype = disparr.fixedType() ? disparr.type() : params.dispType;
        Size leftsize = leftarr.size();

        if (leftarr.size() != rightarr.size())
            CV_Error( Error::StsUnmatchedSizes, "All the images must have the same size" );

        if (leftarr.type() != CV_8UC1 || rightarr.type() != CV_8UC1)
            CV_Error( Error::StsUnsupportedFormat, "Both input images must have CV_8UC1" );

        if (dtype != CV_16SC1 && dtype != CV_32FC1)
            CV_Error( Error::StsUnsupportedFormat, "Disparity image must have CV_16SC1 or CV_32FC1 format" );

        if( params.preFilterType != PREFILTER_NORMALIZED_RESPONSE &&
            params.preFilterType != PREFILTER_XSOBEL )
            CV_Error( Error::StsOutOfRange, "preFilterType must be = CV_STEREO_BM_NORMALIZED_RESPONSE" );

        if( params.preFilterSize < 5 || params.preFilterSize > 255 || params.preFilterSize % 2 == 0 )
            CV_Error( Error::StsOutOfRange, "preFilterSize must be odd and be within 5..255" );

        if( params.preFilterCap < 1 || params.preFilterCap > 63 )
            CV_Error( Error::StsOutOfRange, "preFilterCap must be within 1..63" );

        if( params.SADWindowSize < 5 || params.SADWindowSize > 255 || params.SADWindowSize % 2 == 0 ||
            params.SADWindowSize >= std::min(leftsize.width, leftsize.height) )
            CV_Error( Error::StsOutOfRange, "SADWindowSize must be odd, be within 5..255 and be not larger than image width or height" );

        if( params.numDisparities <= 0 || params.numDisparities % 16 != 0 )
            CV_Error( Error::StsOutOfRange, "numDisparities must be positive and divisible by 16" );

        if( params.textureThreshold < 0 )
            CV_Error( Error::StsOutOfRange, "texture threshold must be non-negative" );

        if( params.uniquenessRatio < 0 )
            CV_Error( Error::StsOutOfRange, "uniqueness ratio must be non-negative" );

        int disp_shift;
        if (dtype == CV_16SC1)
            disp_shift = DISPARITY_SHIFT_16S;
        else
            disp_shift = DISPARITY_SHIFT_32S;

        int FILTERED = (params.minDisparity - 1) << disp_shift;

#ifdef HAVE_OPENCL
        if(ocl::isOpenCLActivated() && disparr.isUMat() && params.textureThreshold == 0)
        {
            UMat left, right;
            if(ocl_prefiltering(leftarr, rightarr, left, right, &params))
            {
                if(ocl_stereobm(left, right, disparr, &params))
                {
                    disp_shift = DISPARITY_SHIFT_16S;
                    FILTERED = (params.minDisparity - 1) << disp_shift;

                    if (params.useFilterSpeckles())
                        filterSpeckles(disparr.getMat(), FILTERED, params.speckleWindowSize, params.speckleRange, slidingSumBuf);
                    if (dtype == CV_32F)
                        disparr.getUMat().convertTo(disparr, CV_32FC1, 1./(1 << disp_shift), 0);
                    CV_IMPL_ADD(CV_IMPL_OCL);
                    return;
                }
            }
        }
#endif

        Mat left0 = leftarr.getMat(), right0 = rightarr.getMat();
        disparr.create(left0.size(), dtype);
        Mat disp0 = disparr.getMat();

        preFilteredImg0.create( left0.size(), CV_8U );
        preFilteredImg1.create( left0.size(), CV_8U );
        cost.create( left0.size(), CV_16S );

        Mat left = preFilteredImg0, right = preFilteredImg1;

        int mindisp = params.minDisparity;
        int ndisp = params.numDisparities;

        int width = left0.cols;
        int height = left0.rows;
        int lofs = std::max(ndisp - 1 + mindisp, 0);
        int rofs = -std::min(ndisp - 1 + mindisp, 0);
        int width1 = width - rofs - ndisp + 1;

        if( lofs >= width || rofs >= width || width1 < 1 )
        {
            disp0 = Scalar::all( FILTERED * ( disp0.type() < CV_32F ? 1 : 1./(1 << disp_shift) ) );
            return;
        }

        Mat disp = disp0;
        if( dtype == CV_32F )
        {
            dispbuf.create(disp0.size(), CV_32S);
            disp = dispbuf;
        }

        {
            const double SAD_overhead_coeff = 10.0;
            const double N0 = 8000000 / (params.useShorts() ? 1 : 4);  // approx tbb's min number instructions reasonable for one thread
            const double maxStripeSize = std::min(
                std::max(
                    N0 / (width * ndisp),
                    (params.SADWindowSize-1) * SAD_overhead_coeff
                ),
                (double)height
            );
            const int nstripes = cvCeil(height / maxStripeSize);
            BufferBM localBuf(nstripes, width, height, params);

            // Prefiltering
            parallel_for_(Range(0, 2), PrefilterInvoker(left0, right0, left, right, localBuf, params), 1);


            Rect validDisparityRect(0, 0, width, height), R1 = params.roi1, R2 = params.roi2;
            validDisparityRect = getValidDisparityROI(!R1.empty() ? R1 : validDisparityRect,
                                                      !R2.empty() ? R2 : validDisparityRect,
                                                      params.minDisparity, params.numDisparities,
                                                      params.SADWindowSize);

            FindStereoCorrespInvoker invoker(left, right, disp, params, nstripes, validDisparityRect, cost, localBuf);
            parallel_for_(Range(0, nstripes), invoker);

            if (params.useFilterSpeckles())
            {
                slidingSumBuf.create( 1, width * height * (sizeof(Point_<short>) + sizeof(int) + sizeof(uchar)), CV_8U );
                filterSpeckles(disp, FILTERED, params.speckleWindowSize, params.speckleRange, slidingSumBuf);
            }

        }

        if (disp0.data != disp.data)
            disp.convertTo(disp0, disp0.type(), 1./(1 << disp_shift), 0);
    }

    int getMinDisparity() const CV_OVERRIDE { return params.minDisparity; }
    void setMinDisparity(int minDisparity) CV_OVERRIDE { params.minDisparity = minDisparity; }

    int getNumDisparities() const CV_OVERRIDE { return params.numDisparities; }
    void setNumDisparities(int numDisparities) CV_OVERRIDE { params.numDisparities = numDisparities; }

    int getBlockSize() const CV_OVERRIDE { return params.SADWindowSize; }
    void setBlockSize(int blockSize) CV_OVERRIDE { params.SADWindowSize = blockSize; }

    int getSpeckleWindowSize() const CV_OVERRIDE { return params.speckleWindowSize; }
    void setSpeckleWindowSize(int speckleWindowSize) CV_OVERRIDE { params.speckleWindowSize = speckleWindowSize; }

    int getSpeckleRange() const CV_OVERRIDE { return params.speckleRange; }
    void setSpeckleRange(int speckleRange) CV_OVERRIDE { params.speckleRange = speckleRange; }

    int getDisp12MaxDiff() const CV_OVERRIDE { return params.disp12MaxDiff; }
    void setDisp12MaxDiff(int disp12MaxDiff) CV_OVERRIDE { params.disp12MaxDiff = disp12MaxDiff; }

    int getPreFilterType() const CV_OVERRIDE { return params.preFilterType; }
    void setPreFilterType(int preFilterType) CV_OVERRIDE { params.preFilterType = preFilterType; }

    int getPreFilterSize() const CV_OVERRIDE { return params.preFilterSize; }
    void setPreFilterSize(int preFilterSize) CV_OVERRIDE { params.preFilterSize = preFilterSize; }

    int getPreFilterCap() const CV_OVERRIDE { return params.preFilterCap; }
    void setPreFilterCap(int preFilterCap) CV_OVERRIDE { params.preFilterCap = preFilterCap; }

    int getTextureThreshold() const CV_OVERRIDE { return params.textureThreshold; }
    void setTextureThreshold(int textureThreshold) CV_OVERRIDE { params.textureThreshold = textureThreshold; }

    int getUniquenessRatio() const CV_OVERRIDE { return params.uniquenessRatio; }
    void setUniquenessRatio(int uniquenessRatio) CV_OVERRIDE { params.uniquenessRatio = uniquenessRatio; }

    int getSmallerBlockSize() const CV_OVERRIDE { return 0; }
    void setSmallerBlockSize(int) CV_OVERRIDE {}

    Rect getROI1() const CV_OVERRIDE { return params.roi1; }
    void setROI1(Rect roi1) CV_OVERRIDE { params.roi1 = roi1; }

    Rect getROI2() const CV_OVERRIDE { return params.roi2; }
    void setROI2(Rect roi2) CV_OVERRIDE { params.roi2 = roi2; }

    void write(FileStorage& fs) const CV_OVERRIDE
    {
        writeFormat(fs);
        fs << "name" << name_
        << "minDisparity" << params.minDisparity
        << "numDisparities" << params.numDisparities
        << "blockSize" << params.SADWindowSize
        << "speckleWindowSize" << params.speckleWindowSize
        << "speckleRange" << params.speckleRange
        << "disp12MaxDiff" << params.disp12MaxDiff
        << "preFilterType" << params.preFilterType
        << "preFilterSize" << params.preFilterSize
        << "preFilterCap" << params.preFilterCap
        << "textureThreshold" << params.textureThreshold
        << "uniquenessRatio" << params.uniquenessRatio;
    }

    void read(const FileNode& fn) CV_OVERRIDE
    {
        FileNode n = fn["name"];
        CV_Assert( n.isString() && String(n) == name_ );
        params.minDisparity = (int)fn["minDisparity"];
        params.numDisparities = (int)fn["numDisparities"];
        params.SADWindowSize = (int)fn["blockSize"];
        params.speckleWindowSize = (int)fn["speckleWindowSize"];
        params.speckleRange = (int)fn["speckleRange"];
        params.disp12MaxDiff = (int)fn["disp12MaxDiff"];
        params.preFilterType = (int)fn["preFilterType"];
        params.preFilterSize = (int)fn["preFilterSize"];
        params.preFilterCap = (int)fn["preFilterCap"];
        params.textureThreshold = (int)fn["textureThreshold"];
        params.uniquenessRatio = (int)fn["uniquenessRatio"];
        params.roi1 = params.roi2 = Rect();
    }

    StereoBMParams params;
    Mat preFilteredImg0, preFilteredImg1, cost, dispbuf;
    Mat slidingSumBuf;

    static const char* name_;
};

const char* StereoBMImpl::name_ = "StereoMatcher.BM";

Ptr<StereoBM> StereoBM::create(int _numDisparities, int _SADWindowSize)
{
    return makePtr<StereoBMImpl>(_numDisparities, _SADWindowSize);
}

}

/* End of file. */