Merge remote-tracking branch 'upstream/master'

Rebase to latest upstream
2025-07-25 14:47:07 +08:00 · 2014-04-13 10:39:32 +01:00 · 2014-04-13 10:39:32 +01:00 · befdef9685
commit befdef9685
parent 34984328fd f104d5be8c
21 changed files with 1437 additions and 804 deletions
--- a/cmake/OpenCVFindLibsGUI.cmake
+++ b/cmake/OpenCVFindLibsGUI.cmake
@ -82,7 +82,7 @@ endif(WITH_OPENGL)
 if(APPLE)
  if(WITH_CARBON)
    set(HAVE_CARBON YES)
-  elseif(NOT IOS)
+  elseif(NOT IOS AND CMAKE_COMPILER_IS_CLANGCXX)
    set(HAVE_COCOA YES)
  endif()
 endif()
--- a/cmake/OpenCVFindLibsVideo.cmake
+++ b/cmake/OpenCVFindLibsVideo.cmake
@ -273,7 +273,7 @@ endif()
 if (NOT IOS)
  if(WITH_QUICKTIME)
    set(HAVE_QUICKTIME YES)
-  elseif(APPLE)
+  elseif(APPLE AND CMAKE_COMPILER_IS_CLANGCXX)
    set(HAVE_QTKIT YES)
  endif()
 endif()
--- a/modules/core/src/arithm.cpp
+++ b/modules/core/src/arithm.cpp
@ -448,11 +448,13 @@ template<typename T> struct OpNot
    T operator()( T a, T ) const { return ~a; }
 };

+#if (ARITHM_USE_IPP == 1)
 static inline void fixSteps(Size sz, size_t elemSize, size_t& step1, size_t& step2, size_t& step)
 {
    if( sz.height == 1 )
        step1 = step2 = step = sz.width*elemSize;
 }
+#endif

 static void add8u( const uchar* src1, size_t step1,
                   const uchar* src2, size_t step2,
--- a/modules/core/src/mathfuncs.cpp
+++ b/modules/core/src/mathfuncs.cpp
@ -46,7 +46,6 @@
 namespace cv
 {

-static const int MAX_BLOCK_SIZE = 1024;
 typedef void (*MathFunc)(const void* src, void* dst, int len);

 static const float atan2_p1 = 0.9997878412794807f*(float)(180/CV_PI);
--- a/modules/core/src/stat.cpp
+++ b/modules/core/src/stat.cpp
@ -680,7 +680,8 @@ static bool ocl_countNonZero( InputArray _src, int & res )

 int cv::countNonZero( InputArray _src )
 {
-    CV_Assert( _src.channels() == 1 );
+    int type = _src.type(), cn = CV_MAT_CN(type);
+    CV_Assert( cn == 1 );

 #ifdef HAVE_OPENCL
    int res = -1;
@ -690,8 +691,33 @@ int cv::countNonZero( InputArray _src )
 #endif

    Mat src = _src.getMat();
-    CountNonZeroFunc func = getCountNonZeroTab(src.depth());

+#if defined HAVE_IPP && !defined HAVE_IPP_ICV_ONLY
+    if (src.dims <= 2 || src.isContinuous())
+    {
+        IppiSize roiSize = { src.cols, src.rows };
+        Ipp32s count, srcstep = (Ipp32s)src.step;
+        IppStatus status = (IppStatus)-1;
+
+        if (src.isContinuous())
+        {
+            roiSize.width = (Ipp32s)src.total();
+            roiSize.height = 1;
+            srcstep = (Ipp32s)src.total() * CV_ELEM_SIZE(type);
+        }
+
+        int depth = CV_MAT_DEPTH(type);
+        if (depth == CV_8U)
+            status = ippiCountInRange_8u_C1R((const Ipp8u *)src.data, srcstep, roiSize, &count, 0, 0);
+        else if (depth == CV_32F)
+            status = ippiCountInRange_32f_C1R((const Ipp32f *)src.data, srcstep, roiSize, &count, 0, 0);
+
+        if (status >= 0)
+            return (Ipp32s)src.total() - count;
+    }
+#endif
+
+    CountNonZeroFunc func = getCountNonZeroTab(src.depth());
    CV_Assert( func != 0 );

    const Mat* arrays[] = {&src, 0};
--- a/modules/core/test/test_countnonzero.cpp
+++ b/modules/core/test/test_countnonzero.cpp
@ -52,9 +52,6 @@ using namespace std;

 #define sign(a) a > 0 ? 1 : a == 0 ? 0 : -1

-const int FLOAT_TYPE [2] = {CV_32F, CV_64F};
-const int INT_TYPE [5] = {CV_8U, CV_8S, CV_16U, CV_16S, CV_32S};
-
 #define MAX_WIDTH 100
 #define MAX_HEIGHT 100

--- a/modules/features2d/include/opencv2/features2d.hpp
+++ b/modules/features2d/include/opencv2/features2d.hpp
@ -405,8 +405,16 @@ public:
 protected:
    virtual void computeImpl( InputArray image, std::vector<KeyPoint>& keypoints, OutputArray descriptors ) const;
    void buildPattern();
-    uchar meanIntensity( InputArray image, InputArray integral, const float kp_x, const float kp_y,
-                         const unsigned int scale, const unsigned int rot, const unsigned int point ) const;
+
+    template <typename imgType, typename iiType>
+    imgType meanIntensity( InputArray image, InputArray integral, const float kp_x, const float kp_y,
+                           const unsigned int scale, const unsigned int rot, const unsigned int point ) const;
+
+    template <typename srcMatType, typename iiMatType>
+    void computeDescriptors( InputArray image, std::vector<KeyPoint>& keypoints, OutputArray descriptors ) const;
+
+    template <typename srcMatType>
+    void extractDescriptor(srcMatType *pointsValue, void ** ptr) const;

    bool orientationNormalized; //true if the orientation is normalized, false otherwise
    bool scaleNormalized; //true if the scale is normalized, false otherwise
--- a/modules/features2d/src/freak.cpp
+++ b/modules/features2d/src/freak.cpp
@ -239,13 +239,129 @@ void FREAK::computeImpl( InputArray _image, std::vector<KeyPoint>& keypoints, Ou

    ((FREAK*)this)->buildPattern();

+    // Convert to gray if not already
+    Mat grayImage = image;
+//    if( image.channels() > 1 )
+//        cvtColor( image, grayImage, COLOR_BGR2GRAY );
+
+    // Use 32-bit integers if we won't overflow in the integral image
+    if ((image.depth() == CV_8U || image.depth() == CV_8S) &&
+        (image.rows * image.cols) < 8388608 ) // 8388608 = 2 ^ (32 - 8(bit depth) - 1(sign bit))
+    {
+        // Create the integral image appropriate for our type & usage
+        if (image.depth() == CV_8U)
+            computeDescriptors<uchar, int>(grayImage, keypoints, _descriptors);
+        else if (image.depth() == CV_8S)
+            computeDescriptors<char, int>(grayImage, keypoints, _descriptors);
+        else
+            CV_Error( Error::StsUnsupportedFormat, "" );
+    } else {
+        // Create the integral image appropriate for our type & usage
+        if ( image.depth() == CV_8U )
+            computeDescriptors<uchar, double>(grayImage, keypoints, _descriptors);
+        else if ( image.depth() == CV_8S )
+            computeDescriptors<char, double>(grayImage, keypoints, _descriptors);
+        else if ( image.depth() == CV_16U )
+            computeDescriptors<ushort, double>(grayImage, keypoints, _descriptors);
+        else if ( image.depth() == CV_16S )
+            computeDescriptors<short, double>(grayImage, keypoints, _descriptors);
+        else
+            CV_Error( Error::StsUnsupportedFormat, "" );
+    }
+}
+
+template <typename srcMatType>
+void FREAK::extractDescriptor(srcMatType *pointsValue, void ** ptr) const
+{
+    std::bitset<FREAK_NB_PAIRS>** ptrScalar = (std::bitset<FREAK_NB_PAIRS>**) ptr;
+
+    // extracting descriptor preserving the order of SSE version
+    int cnt = 0;
+    for( int n = 7; n < FREAK_NB_PAIRS; n += 128)
+    {
+        for( int m = 8; m--; )
+        {
+            int nm = n-m;
+            for(int kk = nm+15*8; kk >= nm; kk-=8, ++cnt)
+            {
+                (*ptrScalar)->set(kk, pointsValue[descriptionPairs[cnt].i] >= pointsValue[descriptionPairs[cnt].j]);
+            }
+        }
+    }
+    --(*ptrScalar);
+}
+
+#if CV_SSE2
+template <>
+void FREAK::extractDescriptor(uchar *pointsValue, void ** ptr) const
+{
+    __m128i** ptrSSE = (__m128i**) ptr;
+
+    // note that comparisons order is modified in each block (but first 128 comparisons remain globally the same-->does not affect the 128,384 bits segmanted matching strategy)
+    int cnt = 0;
+    for( int n = FREAK_NB_PAIRS/128; n-- ; )
+    {
+        __m128i result128 = _mm_setzero_si128();
+        for( int m = 128/16; m--; cnt += 16 )
+        {
+            __m128i operand1 = _mm_set_epi8(pointsValue[descriptionPairs[cnt+0].i],
+                                            pointsValue[descriptionPairs[cnt+1].i],
+                                            pointsValue[descriptionPairs[cnt+2].i],
+                                            pointsValue[descriptionPairs[cnt+3].i],
+                                            pointsValue[descriptionPairs[cnt+4].i],
+                                            pointsValue[descriptionPairs[cnt+5].i],
+                                            pointsValue[descriptionPairs[cnt+6].i],
+                                            pointsValue[descriptionPairs[cnt+7].i],
+                                            pointsValue[descriptionPairs[cnt+8].i],
+                                            pointsValue[descriptionPairs[cnt+9].i],
+                                            pointsValue[descriptionPairs[cnt+10].i],
+                                            pointsValue[descriptionPairs[cnt+11].i],
+                                            pointsValue[descriptionPairs[cnt+12].i],
+                                            pointsValue[descriptionPairs[cnt+13].i],
+                                            pointsValue[descriptionPairs[cnt+14].i],
+                                            pointsValue[descriptionPairs[cnt+15].i]);
+
+            __m128i operand2 = _mm_set_epi8(pointsValue[descriptionPairs[cnt+0].j],
+                                            pointsValue[descriptionPairs[cnt+1].j],
+                                            pointsValue[descriptionPairs[cnt+2].j],
+                                            pointsValue[descriptionPairs[cnt+3].j],
+                                            pointsValue[descriptionPairs[cnt+4].j],
+                                            pointsValue[descriptionPairs[cnt+5].j],
+                                            pointsValue[descriptionPairs[cnt+6].j],
+                                            pointsValue[descriptionPairs[cnt+7].j],
+                                            pointsValue[descriptionPairs[cnt+8].j],
+                                            pointsValue[descriptionPairs[cnt+9].j],
+                                            pointsValue[descriptionPairs[cnt+10].j],
+                                            pointsValue[descriptionPairs[cnt+11].j],
+                                            pointsValue[descriptionPairs[cnt+12].j],
+                                            pointsValue[descriptionPairs[cnt+13].j],
+                                            pointsValue[descriptionPairs[cnt+14].j],
+                                            pointsValue[descriptionPairs[cnt+15].j]);
+
+            __m128i workReg = _mm_min_epu8(operand1, operand2); // emulated "not less than" for 8-bit UNSIGNED integers
+            workReg = _mm_cmpeq_epi8(workReg, operand2);        // emulated "not less than" for 8-bit UNSIGNED integers
+
+            workReg = _mm_and_si128(_mm_set1_epi16(short(0x8080 >> m)), workReg); // merge the last 16 bits with the 128bits std::vector until full
+            result128 = _mm_or_si128(result128, workReg);
+        }
+        (**ptrSSE) = result128;
+        ++(*ptrSSE);
+    }
+    (*ptrSSE) -= 8;
+}
+#endif
+
+template <typename srcMatType, typename iiMatType>
+void FREAK::computeDescriptors( InputArray _image, std::vector<KeyPoint>& keypoints, OutputArray _descriptors ) const {
+
+    Mat image = _image.getMat();
    Mat imgIntegral;
-    integral(image, imgIntegral);
+    integral(image, imgIntegral, DataType<iiMatType>::type);
    std::vector<int> kpScaleIdx(keypoints.size()); // used to save pattern scale index corresponding to each keypoints
    const std::vector<int>::iterator ScaleIdxBegin = kpScaleIdx.begin(); // used in std::vector erase function
    const std::vector<cv::KeyPoint>::iterator kpBegin = keypoints.begin(); // used in std::vector erase function
    const float sizeCst = static_cast<float>(FREAK_NB_SCALES/(FREAK_LOG2* nOctaves));
-    uchar pointsValue[FREAK_NB_POINTS];
+    srcMatType pointsValue[FREAK_NB_POINTS];
    int thetaIdx = 0;
    int direction0;
    int direction1;
@ -300,13 +416,10 @@ void FREAK::computeImpl( InputArray _image, std::vector<KeyPoint>& keypoints, Ou
        _descriptors.create((int)keypoints.size(), FREAK_NB_PAIRS/8, CV_8U);
        _descriptors.setTo(Scalar::all(0));
        Mat descriptors = _descriptors.getMat();
-#if CV_SSE2
-        __m128i* ptr= (__m128i*) (descriptors.data+(keypoints.size()-1)*descriptors.step[0]);
-#else
-        std::bitset<FREAK_NB_PAIRS>* ptr = (std::bitset<FREAK_NB_PAIRS>*) (descriptors.data+(keypoints.size()-1)*descriptors.step[0]);
-#endif
-        for( size_t k = keypoints.size(); k--; )
-        {
+
+        void *ptr = descriptors.data+(keypoints.size()-1)*descriptors.step[0];
+
+        for( size_t k = keypoints.size(); k--; ) {
            // estimate orientation (gradient)
            if( !orientationNormalized )
            {
@ -316,9 +429,10 @@ void FREAK::computeImpl( InputArray _image, std::vector<KeyPoint>& keypoints, Ou
            else
            {
                // get the points intensity value in the un-rotated pattern
-                for( int i = FREAK_NB_POINTS; i--; )
-                {
-                    pointsValue[i] = meanIntensity(image, imgIntegral, keypoints[k].pt.x,keypoints[k].pt.y, kpScaleIdx[k], 0, i);
+                for( int i = FREAK_NB_POINTS; i--; ) {
+                    pointsValue[i] = meanIntensity<srcMatType, iiMatType>(image, imgIntegral,
+                                                                          keypoints[k].pt.x, keypoints[k].pt.y,
+                                                                          kpScaleIdx[k], 0, i);
                }
                direction0 = 0;
                direction1 = 0;
@ -339,80 +453,14 @@ void FREAK::computeImpl( InputArray _image, std::vector<KeyPoint>& keypoints, Ou
                    thetaIdx -= FREAK_NB_ORIENTATION;
            }
            // extract descriptor at the computed orientation
-            for( int i = FREAK_NB_POINTS; i--; )
-            {
-                pointsValue[i] = meanIntensity(image, imgIntegral, keypoints[k].pt.x,keypoints[k].pt.y, kpScaleIdx[k], thetaIdx, i);
+            for( int i = FREAK_NB_POINTS; i--; ) {
+                pointsValue[i] = meanIntensity<srcMatType, iiMatType>(image, imgIntegral,
+                                                                      keypoints[k].pt.x, keypoints[k].pt.y,
+                                                                      kpScaleIdx[k], thetaIdx, i);
            }
-#if CV_SSE2
-            // note that comparisons order is modified in each block (but first 128 comparisons remain globally the same-->does not affect the 128,384 bits segmanted matching strategy)
-            int cnt = 0;
-            for( int n = FREAK_NB_PAIRS/128; n-- ; )
-            {
-                __m128i result128 = _mm_setzero_si128();
-                for( int m = 128/16; m--; cnt += 16 )
-                {
-                    __m128i operand1 = _mm_set_epi8(
-                        pointsValue[descriptionPairs[cnt+0].i],
-                        pointsValue[descriptionPairs[cnt+1].i],
-                        pointsValue[descriptionPairs[cnt+2].i],
-                        pointsValue[descriptionPairs[cnt+3].i],
-                        pointsValue[descriptionPairs[cnt+4].i],
-                        pointsValue[descriptionPairs[cnt+5].i],
-                        pointsValue[descriptionPairs[cnt+6].i],
-                        pointsValue[descriptionPairs[cnt+7].i],
-                        pointsValue[descriptionPairs[cnt+8].i],
-                        pointsValue[descriptionPairs[cnt+9].i],
-                        pointsValue[descriptionPairs[cnt+10].i],
-                        pointsValue[descriptionPairs[cnt+11].i],
-                        pointsValue[descriptionPairs[cnt+12].i],
-                        pointsValue[descriptionPairs[cnt+13].i],
-                        pointsValue[descriptionPairs[cnt+14].i],
-                        pointsValue[descriptionPairs[cnt+15].i]);

-                    __m128i operand2 = _mm_set_epi8(
-                        pointsValue[descriptionPairs[cnt+0].j],
-                        pointsValue[descriptionPairs[cnt+1].j],
-                        pointsValue[descriptionPairs[cnt+2].j],
-                        pointsValue[descriptionPairs[cnt+3].j],
-                        pointsValue[descriptionPairs[cnt+4].j],
-                        pointsValue[descriptionPairs[cnt+5].j],
-                        pointsValue[descriptionPairs[cnt+6].j],
-                        pointsValue[descriptionPairs[cnt+7].j],
-                        pointsValue[descriptionPairs[cnt+8].j],
-                        pointsValue[descriptionPairs[cnt+9].j],
-                        pointsValue[descriptionPairs[cnt+10].j],
-                        pointsValue[descriptionPairs[cnt+11].j],
-                        pointsValue[descriptionPairs[cnt+12].j],
-                        pointsValue[descriptionPairs[cnt+13].j],
-                        pointsValue[descriptionPairs[cnt+14].j],
-                        pointsValue[descriptionPairs[cnt+15].j]);
-
-                    __m128i workReg = _mm_min_epu8(operand1, operand2); // emulated "not less than" for 8-bit UNSIGNED integers
-                    workReg = _mm_cmpeq_epi8(workReg, operand2);        // emulated "not less than" for 8-bit UNSIGNED integers
-
-                    workReg = _mm_and_si128(_mm_set1_epi16(short(0x8080 >> m)), workReg); // merge the last 16 bits with the 128bits std::vector until full
-                    result128 = _mm_or_si128(result128, workReg);
-                }
-                (*ptr) = result128;
-                ++ptr;
-            }
-            ptr -= 8;
-#else
-            // extracting descriptor preserving the order of SSE version
-            int cnt = 0;
-            for( int n = 7; n < FREAK_NB_PAIRS; n += 128)
-            {
-                for( int m = 8; m--; )
-                {
-                    int nm = n-m;
-                    for(int kk = nm+15*8; kk >= nm; kk-=8, ++cnt)
-                    {
-                        ptr->set(kk, pointsValue[descriptionPairs[cnt].i] >= pointsValue[descriptionPairs[cnt].j]);
-                    }
-                }
-            }
-            --ptr;
-#endif
+            // Extract descriptor
+            extractDescriptor<srcMatType>(pointsValue, &ptr);
        }
    }
    else // extract all possible comparisons for selection
@ -434,7 +482,9 @@ void FREAK::computeImpl( InputArray _image, std::vector<KeyPoint>& keypoints, Ou
            {
                //get the points intensity value in the un-rotated pattern
                for( int i = FREAK_NB_POINTS;i--; )
-                    pointsValue[i] = meanIntensity(image, imgIntegral, keypoints[k].pt.x,keypoints[k].pt.y, kpScaleIdx[k], 0, i);
+                    pointsValue[i] = meanIntensity<srcMatType, iiMatType>(image, imgIntegral,
+                                                                          keypoints[k].pt.x,keypoints[k].pt.y,
+                                                                          kpScaleIdx[k], 0, i);

                direction0 = 0;
                direction1 = 0;
@ -456,10 +506,10 @@ void FREAK::computeImpl( InputArray _image, std::vector<KeyPoint>& keypoints, Ou
                    thetaIdx -= FREAK_NB_ORIENTATION;
            }
            // get the points intensity value in the rotated pattern
-            for( int i = FREAK_NB_POINTS; i--; )
-            {
-                pointsValue[i] = meanIntensity(image, imgIntegral, keypoints[k].pt.x,
-                                             keypoints[k].pt.y, kpScaleIdx[k], thetaIdx, i);
+            for( int i = FREAK_NB_POINTS; i--; ) {
+                pointsValue[i] = meanIntensity<srcMatType, iiMatType>(image, imgIntegral,
+                                                                      keypoints[k].pt.x, keypoints[k].pt.y,
+                                                                      kpScaleIdx[k], thetaIdx, i);
            }

            int cnt(0);
@ -478,13 +528,13 @@ void FREAK::computeImpl( InputArray _image, std::vector<KeyPoint>& keypoints, Ou
 }

 // simply take average on a square patch, not even gaussian approx
-uchar FREAK::meanIntensity( InputArray _image, InputArray _integral,
-                            const float kp_x,
-                            const float kp_y,
-                            const unsigned int scale,
-                            const unsigned int rot,
-                            const unsigned int point) const
-{
+template <typename imgType, typename iiType>
+imgType FREAK::meanIntensity( InputArray _image, InputArray _integral,
+                              const float kp_x,
+                              const float kp_y,
+                              const unsigned int scale,
+                              const unsigned int rot,
+                              const unsigned int point) const {
    Mat image = _image.getMat(), integral = _integral.getMat();
    // get point position in image
    const PatternPoint& FreakPoint = patternLookup[scale*FREAK_NB_ORIENTATION*FREAK_NB_POINTS + rot*FREAK_NB_POINTS + point];
@ -492,7 +542,6 @@ uchar FREAK::meanIntensity( InputArray _image, InputArray _integral,
    const float yf = FreakPoint.y+kp_y;
    const int x = int(xf);
    const int y = int(yf);
-    const int& imagecols = image.cols;

    // get the sigma:
    const float radius = FreakPoint.sigma;
@ -505,19 +554,15 @@ uchar FREAK::meanIntensity( InputArray _image, InputArray _integral,
        const int r_y = static_cast<int>((yf-y)*1024);
        const int r_x_1 = (1024-r_x);
        const int r_y_1 = (1024-r_y);
-        uchar* ptr = image.data+x+y*imagecols;
        unsigned int ret_val;
        // linear interpolation:
-        ret_val = (r_x_1*r_y_1*int(*ptr));
-        ptr++;
-        ret_val += (r_x*r_y_1*int(*ptr));
-        ptr += imagecols;
-        ret_val += (r_x*r_y*int(*ptr));
-        ptr--;
-        ret_val += (r_x_1*r_y*int(*ptr));
+        ret_val = r_x_1*r_y_1*int(image.at<imgType>(y  , x  ))
+                + r_x  *r_y_1*int(image.at<imgType>(y  , x+1))
+                + r_x_1*r_y  *int(image.at<imgType>(y+1, x  ))
+                + r_x  *r_y  *int(image.at<imgType>(y+1, x+1));
        //return the rounded mean
        ret_val += 2 * 1024 * 1024;
-        return static_cast<uchar>(ret_val / (4 * 1024 * 1024));
+        return static_cast<imgType>(ret_val / (4 * 1024 * 1024));
    }

    // expected case:
@ -527,15 +572,15 @@ uchar FREAK::meanIntensity( InputArray _image, InputArray _integral,
    const int y_top = int(yf-radius+0.5);
    const int x_right = int(xf+radius+1.5);//integral image is 1px wider
    const int y_bottom = int(yf+radius+1.5);//integral image is 1px higher
-    int ret_val;
+    iiType ret_val;

-    ret_val = integral.at<int>(y_bottom,x_right);//bottom right corner
-    ret_val -= integral.at<int>(y_bottom,x_left);
-    ret_val += integral.at<int>(y_top,x_left);
-    ret_val -= integral.at<int>(y_top,x_right);
+    ret_val = integral.at<iiType>(y_bottom,x_right);//bottom right corner
+    ret_val -= integral.at<iiType>(y_bottom,x_left);
+    ret_val += integral.at<iiType>(y_top,x_left);
+    ret_val -= integral.at<iiType>(y_top,x_right);
    ret_val = ret_val/( (x_right-x_left)* (y_bottom-y_top) );
    //~ std::cout<<integral.step[1]<<std::endl;
-    return static_cast<uchar>(ret_val);
+    return static_cast<imgType>(ret_val);
 }

 // pair selection algorithm from a set of training images and corresponding keypoints
--- a/modules/features2d/src/stardetector.cpp
+++ b/modules/features2d/src/stardetector.cpp
@ -44,20 +44,24 @@
 namespace cv
 {

-static void
-computeIntegralImages( const Mat& matI, Mat& matS, Mat& matT, Mat& _FT )
+template <typename inMatType, typename outMatType> static void
+computeIntegralImages( const Mat& matI, Mat& matS, Mat& matT, Mat& _FT,
+                       int iiType )
 {
-    CV_Assert( matI.type() == CV_8U );
-
    int x, y, rows = matI.rows, cols = matI.cols;

-    matS.create(rows + 1, cols + 1, CV_32S);
-    matT.create(rows + 1, cols + 1, CV_32S);
-    _FT.create(rows + 1, cols + 1, CV_32S);
+    matS.create(rows + 1, cols + 1, iiType );
+    matT.create(rows + 1, cols + 1, iiType );
+    _FT.create(rows + 1, cols + 1, iiType );

-    const uchar* I = matI.ptr<uchar>();
-    int *S = matS.ptr<int>(), *T = matT.ptr<int>(), *FT = _FT.ptr<int>();
-    int istep = (int)matI.step, step = (int)(matS.step/sizeof(S[0]));
+    const inMatType* I = matI.ptr<inMatType>();
+
+    outMatType *S = matS.ptr<outMatType>();
+    outMatType *T = matT.ptr<outMatType>();
+    outMatType *FT = _FT.ptr<outMatType>();
+
+    int istep = (int)(matI.step/matI.elemSize());
+    int step = (int)(matS.step/matS.elemSize());

    for( x = 0; x <= cols; x++ )
        S[x] = T[x] = FT[x] = 0;
@ -95,14 +99,9 @@ computeIntegralImages( const Mat& matI, Mat& matS, Mat& matT, Mat& _FT )
    }
 }

-struct StarFeature
-{
-    int area;
-    int* p[8];
-};
-
-static int
-StarDetectorComputeResponses( const Mat& img, Mat& responses, Mat& sizes, int maxSize )
+template <typename iiMatType> static int
+StarDetectorComputeResponses( const Mat& img, Mat& responses, Mat& sizes,
+                              int maxSize, int iiType )
 {
    const int MAX_PATTERN = 17;
    static const int sizes0[] = {1, 2, 3, 4, 6, 8, 11, 12, 16, 22, 23, 32, 45, 46, 64, 90, 128, -1};
@ -116,16 +115,21 @@ StarDetectorComputeResponses( const Mat& img, Mat& responses, Mat& sizes, int ma
    __m128 sizes1_4[MAX_PATTERN];
    union { int i; float f; } absmask;
    absmask.i = 0x7fffffff;
-    volatile bool useSIMD = cv::checkHardwareSupport(CV_CPU_SSE2);
+    volatile bool useSIMD = cv::checkHardwareSupport(CV_CPU_SSE2) && iiType == CV_32S;
 #endif
+
+    struct StarFeature
+    {
+        int area;
+        iiMatType* p[8];
+    };
+
    StarFeature f[MAX_PATTERN];

    Mat sum, tilted, flatTilted;
    int y, rows = img.rows, cols = img.cols;
    int border, npatterns=0, maxIdx=0;

-    CV_Assert( img.type() == CV_8UC1 );
-
    responses.create( img.size(), CV_32F );
    sizes.create( img.size(), CV_16S );

@ -139,7 +143,18 @@ StarDetectorComputeResponses( const Mat& img, Mat& responses, Mat& sizes, int ma
    npatterns += (pairs[npatterns-1][0] >= 0);
    maxIdx = pairs[npatterns-1][0];

-    computeIntegralImages( img, sum, tilted, flatTilted );
+    // Create the integral image appropriate for our type & usage
+    if ( img.type() == CV_8U )
+        computeIntegralImages<uchar, iiMatType>( img, sum, tilted, flatTilted, iiType );
+    else if ( img.type() == CV_8S )
+        computeIntegralImages<char, iiMatType>( img, sum, tilted, flatTilted, iiType );
+    else if ( img.type() == CV_16U )
+        computeIntegralImages<ushort, iiMatType>( img, sum, tilted, flatTilted, iiType );
+    else if ( img.type() == CV_16S )
+        computeIntegralImages<short, iiMatType>( img, sum, tilted, flatTilted, iiType );
+    else
+        CV_Error( Error::StsUnsupportedFormat, "" );
+
    int step = (int)(sum.step/sum.elemSize());

    for(int i = 0; i <= maxIdx; i++ )
@ -148,15 +163,15 @@ StarDetectorComputeResponses( const Mat& img, Mat& responses, Mat& sizes, int ma
        int ur_area = (2*ur_size + 1)*(2*ur_size + 1);
        int t_area = t_size*t_size + (t_size + 1)*(t_size + 1);

-        f[i].p[0] = sum.ptr<int>() + (ur_size + 1)*step + ur_size + 1;
-        f[i].p[1] = sum.ptr<int>() - ur_size*step + ur_size + 1;
-        f[i].p[2] = sum.ptr<int>() + (ur_size + 1)*step - ur_size;
-        f[i].p[3] = sum.ptr<int>() - ur_size*step - ur_size;
+        f[i].p[0] = sum.ptr<iiMatType>() + (ur_size + 1)*step + ur_size + 1;
+        f[i].p[1] = sum.ptr<iiMatType>() - ur_size*step + ur_size + 1;
+        f[i].p[2] = sum.ptr<iiMatType>() + (ur_size + 1)*step - ur_size;
+        f[i].p[3] = sum.ptr<iiMatType>() - ur_size*step - ur_size;

-        f[i].p[4] = tilted.ptr<int>() + (t_size + 1)*step + 1;
-        f[i].p[5] = flatTilted.ptr<int>() - t_size;
-        f[i].p[6] = flatTilted.ptr<int>() + t_size + 1;
-        f[i].p[7] = tilted.ptr<int>() - t_size*step + 1;
+        f[i].p[4] = tilted.ptr<iiMatType>() + (t_size + 1)*step + 1;
+        f[i].p[5] = flatTilted.ptr<iiMatType>() - t_size;
+        f[i].p[6] = flatTilted.ptr<iiMatType>() + t_size + 1;
+        f[i].p[7] = tilted.ptr<iiMatType>() - t_size*step + 1;

        f[i].area = ur_area + t_area;
        sizes1[i] = sizes0[i];
@ -227,7 +242,7 @@ StarDetectorComputeResponses( const Mat& img, Mat& responses, Mat& sizes, int ma

                for(int i = 0; i <= maxIdx; i++ )
                {
-                    const int** p = (const int**)&f[i].p[0];
+                    const iiMatType** p = (const iiMatType**)&f[i].p[0];
                    __m128i r0 = _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(p[0]+ofs)),
                                               _mm_loadu_si128((const __m128i*)(p[1]+ofs)));
                    __m128i r1 = _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(p[3]+ofs)),
@ -269,9 +284,9 @@ StarDetectorComputeResponses( const Mat& img, Mat& responses, Mat& sizes, int ma

            for(int i = 0; i <= maxIdx; i++ )
            {
-                const int** p = (const int**)&f[i].p[0];
-                vals[i] = p[0][ofs] - p[1][ofs] - p[2][ofs] + p[3][ofs] +
-                    p[4][ofs] - p[5][ofs] - p[6][ofs] + p[7][ofs];
+                const iiMatType** p = (const iiMatType**)&f[i].p[0];
+                vals[i] = (int)(p[0][ofs] - p[1][ofs] - p[2][ofs] + p[3][ofs] +
+                    p[4][ofs] - p[5][ofs] - p[6][ofs] + p[7][ofs]);
            }
            for(int i = 0; i < npatterns; i++ )
            {
@ -429,7 +444,7 @@ StarDetector::StarDetector(int _maxSize, int _responseThreshold,
 void StarDetector::detectImpl( InputArray _image, std::vector<KeyPoint>& keypoints, InputArray _mask ) const
 {
    Mat image = _image.getMat(), mask = _mask.getMat(), grayImage = image;
-    if( image.type() != CV_8U ) cvtColor( image, grayImage, COLOR_BGR2GRAY );
+    if( image.channels() > 1 ) cvtColor( image, grayImage, COLOR_BGR2GRAY );

    (*this)(grayImage, keypoints);
    KeyPointsFilter::runByPixelsMask( keypoints, mask );
@ -438,7 +453,15 @@ void StarDetector::detectImpl( InputArray _image, std::vector<KeyPoint>& keypoin
 void StarDetector::operator()(const Mat& img, std::vector<KeyPoint>& keypoints) const
 {
    Mat responses, sizes;
-    int border = StarDetectorComputeResponses( img, responses, sizes, maxSize );
+    int border;
+
+    // Use 32-bit integers if we won't overflow in the integral image
+    if ((img.depth() == CV_8U || img.depth() == CV_8S) &&
+        (img.rows * img.cols) < 8388608 ) // 8388608 = 2 ^ (32 - 8(bit depth) - 1(sign bit))
+        border = StarDetectorComputeResponses<int>( img, responses, sizes, maxSize, CV_32S );
+    else
+        border = StarDetectorComputeResponses<double>( img, responses, sizes, maxSize, CV_64F );
+
    keypoints.clear();
    if( border >= 0 )
        StarDetectorSuppressNonmax( responses, sizes, keypoints, border,
--- a/modules/flann/include/opencv2/flann/defines.h
+++ b/modules/flann/include/opencv2/flann/defines.h
@ -107,6 +107,7 @@ enum flann_centers_init_t
    FLANN_CENTERS_RANDOM = 0,
    FLANN_CENTERS_GONZALES = 1,
    FLANN_CENTERS_KMEANSPP = 2,
+    FLANN_CENTERS_GROUPWISE = 3,

    // deprecated constants, should use the FLANN_CENTERS_* ones instead
    CENTERS_RANDOM = 0,
--- a/modules/flann/include/opencv2/flann/hierarchical_clustering_index.h
+++ b/modules/flann/include/opencv2/flann/hierarchical_clustering_index.h
@ -257,6 +257,84 @@ private:
    }


+    /**
+     * Chooses the initial centers in a way inspired by Gonzales (by Pierre-Emmanuel Viel):
+     * the first center is drawn at random. For each further center the point list is scanned:
+     * a point is evaluated only if it is further from the chosen centers (by kSpeedUpFactor)
+     * than the current candidate, and it replaces the candidate when the potential it yields
+     * (sum over all points of the distance to the nearest center) is not larger.
+     * If no point passes the filter, dsindices[0] is appended as the new center.
+     *
+     * Used with KMeansIndex, which computes centers by averaging cluster points, this makes
+     * little difference. With HierarchicalClusteringIndex, which picks centers among existing
+     * points instead of computing barycenters, there is a real improvement.
+     *
+     * Params:
+     *     k = number of centers to choose
+     *     dsindices, indices_length = indices into the dataset of the points, and their count
+     *     centers, centers_length = output: chosen dataset indices, and how many were written
+     */
+    void GroupWiseCenterChooser(int k, int* dsindices, int indices_length, int* centers, int& centers_length)
+    {
+        const float kSpeedUpFactor = 1.3f;
+
+        int n = indices_length;
+
+        DistanceType* closestDistSq = new DistanceType[n]; // closestDistSq[i]: smallest distance from point i to a chosen center
+
+        // Choose one random center and set the closestDistSq values
+        int index = rand_int(n);
+        assert(index >=0 && index < n);
+        centers[0] = dsindices[index];
+
+        for (int i = 0; i < n; i++) {
+            closestDistSq[i] = distance(dataset[dsindices[i]], dataset[dsindices[index]], dataset.cols);
+        }
+
+
+        // Choose each remaining center (centers[0] is already set)
+        int centerCount;
+        for (centerCount = 1; centerCount < k; centerCount++) {
+
+            // Scan every point as a potential new center
+            double bestNewPot = -1;
+            int bestNewIndex = 0;
+            DistanceType furthest = 0; // closestDistSq of the current best candidate
+            for (index = 0; index < n; index++) {
+
+                // We will test only the potential of the points further than current candidate
+                if( closestDistSq[index] > kSpeedUpFactor * (float)furthest ) {
+
+                    // Compute the potential obtained if this point became a center
+                    double newPot = 0;
+                    for (int i = 0; i < n; i++) {
+                        newPot += std::min( distance(dataset[dsindices[i]], dataset[dsindices[index]], dataset.cols)
+                                            , closestDistSq[i] );
+                    }
+
+                    // Store the best result (on equal potential the later point wins)
+                    if ((bestNewPot < 0)||(newPot <= bestNewPot)) {
+                        bestNewPot = newPot;
+                        bestNewIndex = index;
+                        furthest = closestDistSq[index];
+                    }
+                }
+            }
+
+            // Add the selected center and refresh the distances to the nearest center
+            centers[centerCount] = dsindices[bestNewIndex];
+            for (int i = 0; i < n; i++) {
+                closestDistSq[i] = std::min( distance(dataset[dsindices[i]], dataset[dsindices[bestNewIndex]], dataset.cols)
+                                             , closestDistSq[i] );
+            }
+        }
+
+        centers_length = centerCount;
+
+        delete[] closestDistSq;
+    }
+
+
 public:


@ -290,6 +368,9 @@ public:
        else if (centers_init_==FLANN_CENTERS_KMEANSPP) {
            chooseCenters = &HierarchicalClusteringIndex::chooseCentersKMeanspp;
        }
+        else if (centers_init_==FLANN_CENTERS_GROUPWISE) {
+            chooseCenters = &HierarchicalClusteringIndex::GroupWiseCenterChooser;
+        }
        else {
            throw FLANNException("Unknown algorithm for choosing initial centers.");
        }
--- a/modules/imgproc/src/color.cpp
+++ b/modules/imgproc/src/color.cpp
@ -2716,6 +2716,8 @@ struct mRGBA2RGBA

 #ifdef HAVE_OPENCL

+#define DIVUP(total, grain) (((total) + (grain) - 1) / (grain))
+
 static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
 {
    bool ok = false;
@ -2729,6 +2731,17 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
    if (depth != CV_8U && depth != CV_16U && depth != CV_32F)
        return false;

+    cv::String opts = format("-D depth=%d -D scn=%d ", depth, scn);
+
+    ocl::Device dev = ocl::Device::getDefault();
+    int pxPerWIy = 1;
+    if (dev.isIntel() && (dev.type() & ocl::Device::TYPE_GPU))
+    {
+        pxPerWIy = 4;
+    }
+    globalsize[1] = DIVUP(globalsize[1], pxPerWIy);
+    opts +=  format("-D PIX_PER_WI_Y=%d ", pxPerWIy);
+
    switch (code)
    {
    case COLOR_BGR2BGRA: case COLOR_RGB2BGRA: case COLOR_BGRA2BGR:
@ -2738,7 +2751,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
        dcn = code == COLOR_BGR2BGRA || code == COLOR_RGB2BGRA || code == COLOR_BGRA2RGBA ? 4 : 3;
        bool reverse = !(code == COLOR_BGR2BGRA || code == COLOR_BGRA2BGR);
        k.create("RGB", ocl::imgproc::cvtcolor_oclsrc,
-                 format("-D depth=%d -D scn=%d -D dcn=%d -D bidx=0 -D %s", depth, scn, dcn,
+                 opts + format("-D dcn=%d -D bidx=0 -D %s", dcn,
                        reverse ? "REVERSE" : "ORDER"));
        break;
    }
@ -2752,7 +2765,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
        int greenbits = code == COLOR_BGR5652BGR || code == COLOR_BGR5652RGB ||
            code == COLOR_BGR5652BGRA || code == COLOR_BGR5652RGBA ? 6 : 5;
        k.create("RGB5x52RGB", ocl::imgproc::cvtcolor_oclsrc,
-                 format("-D depth=%d -D scn=2 -D dcn=%d -D bidx=%d -D greenbits=%d", depth, dcn, bidx, greenbits));
+                 opts + format("-D dcn=%d -D bidx=%d -D greenbits=%d", dcn, bidx, greenbits));
        break;
    }
    case COLOR_BGR2BGR565: case COLOR_BGR2BGR555: case COLOR_RGB2BGR565: case COLOR_RGB2BGR555:
@ -2765,7 +2778,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
            code == COLOR_BGRA2BGR565 || code == COLOR_RGBA2BGR565 ? 6 : 5;
        dcn = 2;
        k.create("RGB2RGB5x5", ocl::imgproc::cvtcolor_oclsrc,
-                 format("-D depth=%d -D scn=%d -D dcn=2 -D bidx=%d -D greenbits=%d", depth, scn, bidx, greenbits));
+                 opts + format("-D dcn=2 -D bidx=%d -D greenbits=%d", bidx, greenbits));
        break;
    }
    case COLOR_BGR5652GRAY: case COLOR_BGR5552GRAY:
@ -2774,7 +2787,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
        dcn = 1;
        int greenbits = code == COLOR_BGR5652GRAY ? 6 : 5;
        k.create("BGR5x52Gray", ocl::imgproc::cvtcolor_oclsrc,
-                 format("-D depth=%d -D scn=2 -D dcn=1 -D bidx=0 -D greenbits=%d", depth, greenbits));
+                 opts + format("-D dcn=1 -D bidx=0 -D greenbits=%d", greenbits));
        break;
    }
    case COLOR_GRAY2BGR565: case COLOR_GRAY2BGR555:
@ -2783,7 +2796,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
        dcn = 2;
        int greenbits = code == COLOR_GRAY2BGR565 ? 6 : 5;
        k.create("Gray2BGR5x5", ocl::imgproc::cvtcolor_oclsrc,
-                 format("-D depth=%d -D scn=1 -D dcn=2 -D bidx=0 -D greenbits=%d", depth, greenbits));
+                 opts + format("-D dcn=2 -D bidx=0 -D greenbits=%d", greenbits));
        break;
    }
    case COLOR_BGR2GRAY: case COLOR_BGRA2GRAY:
@ -2793,8 +2806,8 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
        bidx = code == COLOR_BGR2GRAY || code == COLOR_BGRA2GRAY ? 0 : 2;
        dcn = 1;
        k.create("RGB2Gray", ocl::imgproc::cvtcolor_oclsrc,
-                 format("-D depth=%d -D scn=%d -D dcn=1 -D bidx=%d -D STRIPE_SIZE=%d",
-                        depth, scn, bidx, stripeSize));
+                 opts + format("-D dcn=1 -D bidx=%d -D STRIPE_SIZE=%d",
+                               bidx, stripeSize));
        globalsize[0] = (src.cols + stripeSize-1)/stripeSize;
        break;
    }
@ -2804,7 +2817,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
        CV_Assert(scn == 1);
        dcn = code == COLOR_GRAY2BGRA ? 4 : 3;
        k.create("Gray2RGB", ocl::imgproc::cvtcolor_oclsrc,
-                 format("-D depth=%d -D bidx=0 -D scn=1 -D dcn=%d", depth, dcn));
+                 opts + format("-D bidx=0 -D dcn=%d", dcn));
        break;
    }
    case COLOR_BGR2YUV:
@ -2814,7 +2827,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
        bidx = code == COLOR_RGB2YUV ? 0 : 2;
        dcn = 3;
        k.create("RGB2YUV", ocl::imgproc::cvtcolor_oclsrc,
-                 format("-D depth=%d -D scn=%d -D dcn=3 -D bidx=%d", depth, scn, bidx));
+                 opts + format("-D dcn=3 -D bidx=%d", bidx));
        break;
    }
    case COLOR_YUV2BGR:
@ -2824,7 +2837,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
        CV_Assert(dcn == 3 || dcn == 4);
        bidx = code == COLOR_YUV2RGB ? 0 : 2;
        k.create("YUV2RGB", ocl::imgproc::cvtcolor_oclsrc,
-                 format("-D depth=%d -D scn=3 -D dcn=%d -D bidx=%d", depth, dcn, bidx));
+                 opts + format("-D dcn=%d -D bidx=%d", dcn, bidx));
        break;
    }
    case COLOR_YUV2RGB_NV12: case COLOR_YUV2BGR_NV12:
@ -2837,7 +2850,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )

        dstSz = Size(sz.width, sz.height * 2 / 3);
        k.create("YUV2RGB_NV12", ocl::imgproc::cvtcolor_oclsrc,
-                 format("-D depth=0 -D scn=1 -D dcn=%d -D bidx=%d", dcn, bidx));
+                 opts + format("-D dcn=%d -D bidx=%d", dcn, bidx));
        break;
    }
    case COLOR_BGR2YCrCb:
@ -2847,7 +2860,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
        bidx = code == COLOR_BGR2YCrCb ? 0 : 2;
        dcn = 3;
        k.create("RGB2YCrCb", ocl::imgproc::cvtcolor_oclsrc,
-                 format("-D depth=%d -D scn=%d -D dcn=3 -D bidx=%d", depth, scn, bidx));
+                 opts + format("-D dcn=3 -D bidx=%d", bidx));
        break;
    }
    case COLOR_YCrCb2BGR:
@ -2858,7 +2871,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
        CV_Assert(scn == 3 && (dcn == 3 || dcn == 4));
        bidx = code == COLOR_YCrCb2BGR ? 0 : 2;
        k.create("YCrCb2RGB", ocl::imgproc::cvtcolor_oclsrc,
-                 format("-D depth=%d -D scn=%d -D dcn=%d -D bidx=%d", depth, scn, dcn, bidx));
+                 opts + format("-D dcn=%d -D bidx=%d", dcn, bidx));
        break;
    }
    case COLOR_BGR2XYZ: case COLOR_RGB2XYZ:
@ -2904,7 +2917,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
        dst = _dst.getUMat();

        k.create("RGB2XYZ", ocl::imgproc::cvtcolor_oclsrc,
-                 format("-D depth=%d -D scn=%d -D dcn=3 -D bidx=%d", depth, scn, bidx));
+                 opts + format("-D dcn=3 -D bidx=%d", bidx));
        if (k.empty())
            return false;
        k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst), ocl::KernelArg::PtrReadOnly(c));
@ -2955,7 +2968,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
        dst = _dst.getUMat();

        k.create("XYZ2RGB", ocl::imgproc::cvtcolor_oclsrc,
-                 format("-D depth=%d -D scn=3 -D dcn=%d -D bidx=%d", depth, dcn, bidx));
+                 opts + format("-D dcn=%d -D bidx=%d", dcn, bidx));
        if (k.empty())
            return false;
        k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst), ocl::KernelArg::PtrReadOnly(c));
@ -3010,8 +3023,9 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
            _dst.create(dstSz, CV_8UC3);
            dst = _dst.getUMat();

-            k.create("RGB2HSV", ocl::imgproc::cvtcolor_oclsrc, format("-D depth=%d -D hrange=%d -D bidx=%d -D dcn=3 -D scn=%d",
-                                                                      depth, hrange, bidx, scn));
+            k.create("RGB2HSV", ocl::imgproc::cvtcolor_oclsrc,
+                     opts + format("-D hrange=%d -D bidx=%d -D dcn=3",
+                                   hrange, bidx));
            if (k.empty())
                return false;

@ -3023,7 +3037,8 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
        }
        else
            k.create(kernelName.c_str(), ocl::imgproc::cvtcolor_oclsrc,
-                     format("-D depth=%d -D hscale=%ff -D bidx=%d -D scn=%d -D dcn=3", depth, hrange*(1.f/360.f), bidx, scn));
+                     opts + format("-D hscale=%ff -D bidx=%d -D dcn=3",
+                                   hrange*(1.f/360.f), bidx));
        break;
    }
    case COLOR_HSV2BGR: case COLOR_HSV2RGB: case COLOR_HSV2BGR_FULL: case COLOR_HSV2RGB_FULL:
@ -3041,8 +3056,8 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )

        String kernelName = String(is_hsv ? "HSV" : "HLS") + "2RGB";
        k.create(kernelName.c_str(), ocl::imgproc::cvtcolor_oclsrc,
-                 format("-D depth=%d -D dcn=%d -D scn=3 -D bidx=%d -D hrange=%d -D hscale=%ff",
-                        depth, dcn, bidx, hrange, 6.f/hrange));
+                 opts + format("-D dcn=%d -D bidx=%d -D hrange=%d -D hscale=%ff",
+                               dcn, bidx, hrange, 6.f/hrange));
        break;
    }
    case COLOR_RGBA2mRGBA: case COLOR_mRGBA2RGBA:
@ -3051,7 +3066,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
        dcn = 4;

        k.create(code == COLOR_RGBA2mRGBA ? "RGBA2mRGBA" : "mRGBA2RGBA", ocl::imgproc::cvtcolor_oclsrc,
-                 format("-D depth=%d -D dcn=4 -D scn=4 -D bidx=3", depth));
+                 opts + "-D dcn=4 -D bidx=3");
        break;
    }
    case CV_BGR2Lab: case CV_RGB2Lab: case CV_LBGR2Lab: case CV_LRGB2Lab:
@ -3063,8 +3078,8 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
        dcn = 3;

        k.create("BGR2Lab", ocl::imgproc::cvtcolor_oclsrc,
-                 format("-D depth=%d -D dcn=3 -D scn=%d -D bidx=%d%s",
-                        depth, scn, bidx, srgb ? " -D SRGB" : ""));
+                 opts + format("-D dcn=3 -D bidx=%d%s",
+                               bidx, srgb ? " -D SRGB" : ""));
        if (k.empty())
            return false;

@ -3165,8 +3180,8 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
        bool srgb = code == CV_Lab2BGR || code == CV_Lab2RGB;

        k.create("Lab2BGR", ocl::imgproc::cvtcolor_oclsrc,
-                 format("-D depth=%d -D dcn=%d -D scn=3 -D bidx=%d%s",
-                        depth, dcn, bidx, srgb ? " -D SRGB" : ""));
+                 opts + format("-D dcn=%d -D bidx=%d%s",
+                               dcn, bidx, srgb ? " -D SRGB" : ""));
        if (k.empty())
            return false;

--- a/modules/imgproc/src/histogram.cpp
+++ b/modules/imgproc/src/histogram.cpp
@ -1175,6 +1175,48 @@ calcHist_8u( std::vector<uchar*>& _ptrs, const std::vector<int>& _deltas,
    }
 }

+#if defined HAVE_IPP && !defined HAVE_IPP_ICV_ONLY
+
+// Parallel body computing the histogram of an 8-bit single-channel image with IPP. Each stripe
+// builds a private partial histogram that is then added into the shared one via CV_XADD.
+class IPPCalcHistInvoker : public ParallelLoopBody
+{
+public:
+    // _histSize is the number of levels handed to IPP, i.e. one more than the number of bins.
+    IPPCalcHistInvoker(const Mat & _src, Mat & _hist, AutoBuffer<Ipp32s> & _levels, Ipp32s _histSize, Ipp32s _low, Ipp32s _high, bool * _ok) :
+        ParallelLoopBody(), src(&_src), hist(&_hist), levels(&_levels), histSize(_histSize), low(_low), high(_high), ok(_ok)
+    {
+        *ok = true;
+    }
+
+    virtual void operator() (const Range & range) const
+    {
+        Mat phist(hist->size(), hist->type(), Scalar::all(0)); // partial histogram of rows [range.start, range.end)
+        IppStatus status = ippiHistogramEven_8u_C1R(
+            src->data + src->step * range.start, (int)src->step, ippiSize(src->cols, range.end - range.start),
+            (Ipp32s *)phist.data, (Ipp32s *)*levels, histSize, low, high);
+        if (status < 0)
+        {
+            *ok = false;
+            return;
+        }
+        // histSize counts levels, so only histSize - 1 bins exist; iterating up to histSize
+        // would read and write one element past the end of both histograms.
+        for (int i = 0; i < histSize - 1; ++i)
+            CV_XADD((int *)(hist->data + i * hist->step), *(int *)(phist.data + i * phist.step));
+    }
+
+private:
+    const Mat * src;
+    Mat * hist;
+    AutoBuffer<Ipp32s> * levels;
+    Ipp32s histSize, low, high; // histSize: number of levels (bins + 1)
+    bool * ok;
+    const IPPCalcHistInvoker & operator = (const IPPCalcHistInvoker & );
+};
+
+#endif
+
 }

 void cv::calcHist( const Mat* images, int nimages, const int* channels,
@ -1190,6 +1232,32 @@ void cv::calcHist( const Mat* images, int nimages, const int* channels,
    Mat hist = _hist.getMat(), ihist = hist;
    ihist.flags = (ihist.flags & ~CV_MAT_TYPE_MASK)|CV_32S;

+#if defined HAVE_IPP && !defined HAVE_IPP_ICV_ONLY
+    if (nimages == 1 && images[0].type() == CV_8UC1 && dims == 1 && channels &&
+            channels[0] == 0 && mask.empty() && images[0].dims <= 2 &&
+            !accumulate && uniform)
+    {
+        ihist.setTo(Scalar::all(0));
+        AutoBuffer<Ipp32s> levels(histSize[0] + 1);
+
+        bool ok = true;
+        const Mat & src = images[0];
+        int nstripes = std::min<int>(8, src.total() / (1 << 16));
+#ifdef HAVE_CONCURRENCY
+        nstripes = 1;
+#endif
+        IPPCalcHistInvoker invoker(src, ihist, levels, histSize[0] + 1, (Ipp32s)ranges[0][0], (Ipp32s)ranges[0][1], &ok);
+        Range range(0, src.rows);
+        parallel_for_(range, invoker, nstripes);
+
+        if (ok)
+        {
+            ihist.convertTo(hist, CV_32F);
+            return;
+        }
+    }
+#endif
+
    if( !accumulate || histdata != hist.data )
        hist = Scalar(0.);
    else
@ -1477,7 +1545,7 @@ void cv::calcHist( InputArrayOfArrays images, const std::vector<int>& channels,
    CV_OCL_RUN(images.total() == 1 && channels.size() == 1 && images.channels(0) == 1 &&
               channels[0] == 0 && images.isUMatVector() && mask.empty() && !accumulate &&
               histSize.size() == 1 && histSize[0] == BINS && ranges.size() == 2 &&
-               ranges[0] == 0 && ranges[1] == 256,
+               ranges[0] == 0 && ranges[1] == BINS,
               ocl_calcHist(images, hist))

    int i, dims = (int)histSize.size(), rsz = (int)ranges.size(), csz = (int)channels.size();
--- a/modules/imgproc/src/moments.cpp
+++ b/modules/imgproc/src/moments.cpp
@ -466,6 +466,61 @@ cv::Moments cv::moments( InputArray _src, bool binary )
        if( cn > 1 )
            CV_Error( CV_StsBadArg, "Invalid image type (must be single-channel)" );

+#if (IPP_VERSION_X100 >= 801)
+        if (!binary)
+        {
+            IppiSize roi = {mat.cols, mat.rows};
+            IppiMomentState_64f *moment;
+            // ippiMomentInitAlloc_64f and ippiMomentFree_64f are deprecated in 8.1, but there is no other way
+            // to initialize IppiMomentState_64f. Once GetStateSize and Init functions become available, this
+            // code has to be changed to use them.
+            if (0 <= ippiMomentInitAlloc_64f(&moment, ippAlgHintAccurate))
+            {
+                IppStatus sts = (IppStatus)(-1);
+                if (depth == CV_8U)
+                    sts = ippiMoments64f_8u_C1R((const Ipp8u *)mat.data, (int)mat.step, roi, moment);
+                else if( depth == CV_16U )
+                    sts = ippiMoments64f_16u_C1R((const Ipp16u *)mat.data, (int)mat.step, roi, moment);
+                else if( depth == CV_32F )
+                    sts = ippiMoments64f_32f_C1R((const Ipp32f *)mat.data, (int)mat.step, roi, moment);
+                if (0 <= sts)
+                {
+                    IppiPoint point = {0, 0};
+                    ippiGetSpatialMoment_64f(moment, 0, 0, 0, point, &m.m00);
+                    ippiGetSpatialMoment_64f(moment, 1, 0, 0, point, &m.m10);
+                    ippiGetSpatialMoment_64f(moment, 0, 1, 0, point, &m.m01);
+
+                    ippiGetSpatialMoment_64f(moment, 2, 0, 0, point, &m.m20);
+                    ippiGetSpatialMoment_64f(moment, 1, 1, 0, point, &m.m11);
+                    ippiGetSpatialMoment_64f(moment, 0, 2, 0, point, &m.m02);
+
+                    ippiGetSpatialMoment_64f(moment, 3, 0, 0, point, &m.m30);
+                    ippiGetSpatialMoment_64f(moment, 2, 1, 0, point, &m.m21);
+                    ippiGetSpatialMoment_64f(moment, 1, 2, 0, point, &m.m12);
+                    ippiGetSpatialMoment_64f(moment, 0, 3, 0, point, &m.m03);
+                    ippiGetCentralMoment_64f(moment, 2, 0, 0, &m.mu20);
+                    ippiGetCentralMoment_64f(moment, 1, 1, 0, &m.mu11);
+                    ippiGetCentralMoment_64f(moment, 0, 2, 0, &m.mu02);
+                    ippiGetCentralMoment_64f(moment, 3, 0, 0, &m.mu30);
+                    ippiGetCentralMoment_64f(moment, 2, 1, 0, &m.mu21);
+                    ippiGetCentralMoment_64f(moment, 1, 2, 0, &m.mu12);
+                    ippiGetCentralMoment_64f(moment, 0, 3, 0, &m.mu03);
+                    ippiGetNormalizedCentralMoment_64f(moment, 2, 0, 0, &m.nu20);
+                    ippiGetNormalizedCentralMoment_64f(moment, 1, 1, 0, &m.nu11);
+                    ippiGetNormalizedCentralMoment_64f(moment, 0, 2, 0, &m.nu02);
+                    ippiGetNormalizedCentralMoment_64f(moment, 3, 0, 0, &m.nu30);
+                    ippiGetNormalizedCentralMoment_64f(moment, 2, 1, 0, &m.nu21);
+                    ippiGetNormalizedCentralMoment_64f(moment, 1, 2, 0, &m.nu12);
+                    ippiGetNormalizedCentralMoment_64f(moment, 0, 3, 0, &m.nu03);
+
+                    ippiMomentFree_64f(moment);
+                    return m;
+                }
+                ippiMomentFree_64f(moment);
+            }
+        }
+#endif
+
        if( binary || depth == CV_8U )
            func = momentsInTile<uchar, int, int>;
        else if( depth == CV_16U )
--- a/modules/imgproc/src/opencl/cvtcolor.cl
+++ b/modules/imgproc/src/opencl/cvtcolor.cl
--- a/modules/imgproc/src/samplers.cpp
+++ b/modules/imgproc/src/samplers.cpp
@ -172,7 +172,7 @@ void getRectSubPix_Cn_(const _Tp* src, size_t src_step, Size src_size,
                dst[j+1] = cast_op(s1);
            }

-            for( j = 0; j < win_size.width; j++ )
+            for( ; j < win_size.width; j++ )
            {
                _WTp s0 = src[j]*a11 + src[j+cn]*a12 + src[j+src_step]*a21 + src[j+src_step+cn]*a22;
                dst[j] = cast_op(s0);
--- a/modules/imgproc/src/sumpixels.cpp
+++ b/modules/imgproc/src/sumpixels.cpp
@ -219,6 +219,8 @@ static void integral_##suffix( T* src, size_t srcstep, ST* sum, size_t sumstep,
 DEF_INTEGRAL_FUNC(8u32s, uchar, int, double)
 DEF_INTEGRAL_FUNC(8u32f64f, uchar, float, double)
 DEF_INTEGRAL_FUNC(8u64f64f, uchar, double, double)
+DEF_INTEGRAL_FUNC(16u64f64f, ushort, double, double)
+DEF_INTEGRAL_FUNC(16s64f64f, short, double, double)
 DEF_INTEGRAL_FUNC(32f32f64f, float, float, double)
 DEF_INTEGRAL_FUNC(32f64f64f, float, double, double)
 DEF_INTEGRAL_FUNC(64f64f64f, double, double, double)
@ -411,6 +413,10 @@ void cv::integral( InputArray _src, OutputArray _sum, OutputArray _sqsum, Output
        func = (IntegralFunc)integral_8u32f32f;
    else if( depth == CV_8U && sdepth == CV_64F && sqdepth == CV_64F )
        func = (IntegralFunc)integral_8u64f64f;
+    else if( depth == CV_16U && sdepth == CV_64F && sqdepth == CV_64F )
+        func = (IntegralFunc)integral_16u64f64f;
+    else if( depth == CV_16S && sdepth == CV_64F && sqdepth == CV_64F )
+        func = (IntegralFunc)integral_16s64f64f;
    else if( depth == CV_32F && sdepth == CV_32F && sqdepth == CV_64F )
        func = (IntegralFunc)integral_32f32f64f;
    else if( depth == CV_32F && sdepth == CV_32F && sqdepth == CV_32F )
--- a/modules/imgproc/src/templmatch.cpp
+++ b/modules/imgproc/src/templmatch.cpp
@ -341,10 +341,93 @@ static bool ocl_matchTemplate( InputArray _img, InputArray _templ, OutputArray _

 #endif

+#if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
+
+typedef IppStatus (CV_STDCALL * ippimatchTemplate)(const void*, int, IppiSize, const void*, int, IppiSize, Ipp32f* , int , IppEnum , Ipp8u*);
+
+// Cross-correlates src with tpl through IPP (no normalization, "valid" region only) and writes
+// 32-bit float results into dst. Returns false when the inputs are unsupported or IPP reports
+// an error, so that the caller can fall back to the generic implementation.
+static bool ipp_crossCorr(const Mat& src, const Mat& tpl, Mat& dst)
+{
+    if (src.channels() != 1 || tpl.type() != src.type() ||
+        src.cols < tpl.cols || src.rows < tpl.rows)
+        return false;
+    // IPP writes Ipp32f values through dst.data without allocating anything, so dst must
+    // already be a single-channel float matrix of exactly the valid-region size.
+    if (dst.type() != CV_32FC1 || dst.cols != src.cols - tpl.cols + 1 ||
+        dst.rows != src.rows - tpl.rows + 1)
+        return false;
+
+    int depth = src.depth();
+    ippimatchTemplate ippFunc =
+            depth==CV_8U ? (ippimatchTemplate)ippiCrossCorrNorm_8u32f_C1R:
+            depth==CV_32F? (ippimatchTemplate)ippiCrossCorrNorm_32f_C1R: 0;
+    if (ippFunc==0)
+        return false;
+
+    IppiSize srcRoiSize = {src.cols,src.rows};
+    IppiSize tplRoiSize = {tpl.cols,tpl.rows};
+    IppEnum funCfg = (IppEnum)(ippAlgAuto | ippiNormNone | ippiROIValid);
+
+    int bufSize=0;
+    if (ippiCrossCorrNormGetBufferSize(srcRoiSize, tplRoiSize, funCfg, &bufSize) < 0)
+        return false;
+    Ipp8u *pBuffer = ippsMalloc_8u( bufSize );
+    if (bufSize > 0 && pBuffer == 0) // work buffer could not be allocated
+        return false;
+    IppStatus status = ippFunc(src.data, (int)src.step, srcRoiSize, tpl.data, (int)tpl.step, tplRoiSize, (Ipp32f*)dst.data, (int)dst.step, funCfg, pBuffer);
+    ippsFree( pBuffer );
+    return status >= 0;
+}
+
+// Computes the non-normalized squared-distance map between tpl and src through IPP ("valid"
+// region only) and writes 32-bit float results into dst. Returns false when the inputs are
+// unsupported or IPP reports an error, so that the caller can fall back to the generic path.
+static bool ipp_sqrDistance(const Mat& src, const Mat& tpl, Mat& dst)
+{
+    if (src.channels() != 1 || tpl.type() != src.type() ||
+        src.cols < tpl.cols || src.rows < tpl.rows)
+        return false;
+    // IPP writes Ipp32f values through dst.data without allocating anything, so dst must
+    // already be a single-channel float matrix of exactly the valid-region size.
+    if (dst.type() != CV_32FC1 || dst.cols != src.cols - tpl.cols + 1 ||
+        dst.rows != src.rows - tpl.rows + 1)
+        return false;
+
+    int depth = src.depth();
+    ippimatchTemplate ippFunc =
+            depth==CV_8U ? (ippimatchTemplate)ippiSqrDistanceNorm_8u32f_C1R:
+            depth==CV_32F? (ippimatchTemplate)ippiSqrDistanceNorm_32f_C1R: 0;
+    if (ippFunc==0)
+        return false;
+
+    IppiSize srcRoiSize = {src.cols,src.rows};
+    IppiSize tplRoiSize = {tpl.cols,tpl.rows};
+    IppEnum funCfg = (IppEnum)(ippAlgAuto | ippiNormNone | ippiROIValid);
+
+    int bufSize=0;
+    if (ippiSqrDistanceNormGetBufferSize(srcRoiSize, tplRoiSize, funCfg, &bufSize) < 0)
+        return false;
+    Ipp8u *pBuffer = ippsMalloc_8u( bufSize );
+    if (bufSize > 0 && pBuffer == 0) // work buffer could not be allocated
+        return false;
+    IppStatus status = ippFunc(src.data, (int)src.step, srcRoiSize, tpl.data, (int)tpl.step, tplRoiSize, (Ipp32f*)dst.data, (int)dst.step, funCfg, pBuffer);
+    ippsFree( pBuffer );
+    return status >= 0;
+}
+
+#endif
+
 void crossCorr( const Mat& img, const Mat& _templ, Mat& corr,
                Size corrsize, int ctype,
                Point anchor, double delta, int borderType )
 {
+#if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
+    if (ipp_crossCorr(img, _templ, corr))
+        return;
+#endif
+
    const double blockScale = 4.5;
    const int minBlockSize = 256;
    std::vector<uchar> buf;
@ -560,6 +643,11 @@ void cv::matchTemplate( InputArray _img, InputArray _templ, OutputArray _result,
        return;
 #endif

+#if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
+    if (method == CV_TM_SQDIFF && ipp_sqrDistance(img, templ, result))
+        return;
+#endif
+
    int cn = img.channels();
    crossCorr( img, templ, result, result.size(), result.type(), Point(0,0), 0, 0);

--- a/modules/nonfree/src/sift.cpp
+++ b/modules/nonfree/src/sift.cpp
@ -111,21 +111,6 @@ namespace cv

 /******************************* Defs and macros *****************************/

-// default number of sampled intervals per octave
-static const int SIFT_INTVLS = 3;
-
-// default sigma for initial gaussian smoothing
-static const float SIFT_SIGMA = 1.6f;
-
-// default threshold on keypoint contrast |D(x)|
-static const float SIFT_CONTR_THR = 0.04f;
-
-// default threshold on keypoint ratio of principle curvatures
-static const float SIFT_CURV_THR = 10.f;
-
-// double image size before pyramid construction?
-static const bool SIFT_IMG_DBL = true;
-
 // default width of descriptor histogram array
 static const int SIFT_DESCR_WIDTH = 4;

--- a/modules/video/src/simpleflow.cpp
+++ b/modules/video/src/simpleflow.cpp
@ -66,21 +66,6 @@ inline static float dist(const Vec2f& p1, const Vec2f& p2) {
         (p1[1] - p2[1]) * (p1[1] - p2[1]);
 }

-inline static float dist(const Point2f& p1, const Point2f& p2) {
-  return (p1.x - p2.x) * (p1.x - p2.x) +
-         (p1.y - p2.y) * (p1.y - p2.y);
-}
-
-inline static float dist(float x1, float y1, float x2, float y2) {
-  return (x1 - x2) * (x1 - x2) +
-         (y1 - y2) * (y1 - y2);
-}
-
-inline static int dist(int x1, int y1, int x2, int y2) {
-  return (x1 - x2) * (x1 - x2) +
-         (y1 - y2) * (y1 - y2);
-}
-
 template<class T>
 inline static T min(T t1, T t2, T t3) {
  return (t1 <= t2 && t1 <= t3) ? t1 : min(t2, t3);
--- a/samples/cpp/CMakeLists.txt
+++ b/samples/cpp/CMakeLists.txt
@ -93,6 +93,10 @@ if(BUILD_EXAMPLES AND OCV_DEPENDENCIES_FOUND)

  ocv_list_filterout(cpp_samples "viz")

+  if(NOT HAVE_IPP_A)
+    ocv_list_filterout(cpp_samples "/ippasync/")
+  endif()
+
  foreach(sample_filename ${cpp_samples})
    get_filename_component(sample ${sample_filename} NAME_WE)
    OPENCV_DEFINE_CPP_EXAMPLE(${sample}  ${sample_filename})