diff --git a/3rdparty/tbb/.gitignore b/3rdparty/tbb/.gitignore new file mode 100644 index 0000000000..601e1b265e --- /dev/null +++ b/3rdparty/tbb/.gitignore @@ -0,0 +1 @@ +tbb*.tgz \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 62709b8050..e048156a64 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -140,6 +140,9 @@ OCV_OPTION(WITH_XIMEA "Include XIMEA cameras support" OFF OCV_OPTION(WITH_XINE "Include Xine support (GPL)" OFF IF (UNIX AND NOT APPLE AND NOT ANDROID) ) OCV_OPTION(WITH_CLP "Include Clp support (EPL)" OFF) OCV_OPTION(WITH_OPENCL "Include OpenCL Runtime support" OFF IF (NOT ANDROID AND NOT IOS) ) +OCV_OPTION(WITH_OPENCLAMDFFT "Include AMD OpenCL FFT library support" OFF IF (NOT ANDROID AND NOT IOS) ) +OCV_OPTION(WITH_OPENCLAMDBLAS "Include AMD OpenCL BLAS library support" OFF IF (NOT ANDROID AND NOT IOS) ) + # OpenCV build components # =================================================== @@ -396,6 +399,12 @@ if(WITH_OPENCL) if(OPENCL_FOUND) set(HAVE_OPENCL 1) endif() + if(WITH_OPENCLAMDFFT) + set(HAVE_CLAMDFFT 1) + endif() + if(WITH_OPENCLAMDBLAS) + set(HAVE_CLAMDBLAS 1) + endif() endif() # ---------------------------------------------------------------------------- diff --git a/android/android.toolchain.cmake b/android/android.toolchain.cmake index 324074c8f2..3c89806ac0 100644 --- a/android/android.toolchain.cmake +++ b/android/android.toolchain.cmake @@ -4,7 +4,7 @@ # See home page: http://code.google.com/p/android-cmake/ # # The file is mantained by the OpenCV project. And also can be found at -# http://code.opencv.org/svn/opencv/trunk/opencv/android/android.toolchain.cmake +# http://code.opencv.org/projects/opencv/repository/revisions/master/changes/android/android.toolchain.cmake # # Usage Linux: # $ export ANDROID_NDK=/absolute/path/to/the/android-ndk @@ -182,6 +182,7 @@ # [+] added mips architecture support # - modified August 2012 # [+] updated for NDK r8b +# [~] all intermediate files generated by toolchain are moved into CMakeFiles # ------------------------------------------------------------------------------ cmake_minimum_required( VERSION 2.6.3 ) @@ -854,45 +855,48 @@ elseif( X86 ) endif() #linker flags -list( APPEND ANDROID_SYSTEM_LIB_DIRS "${CMAKE_BINARY_DIR}/systemlibs/${ANDROID_NDK_ABI_NAME}" "${CMAKE_INSTALL_PREFIX}/libs/${ANDROID_NDK_ABI_NAME}" ) +if( NOT DEFINED __ndklibspath ) + set( __ndklibspath "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/ndklibs/${ANDROID_NDK_ABI_NAME}" ) +endif() +list( APPEND ANDROID_SYSTEM_LIB_DIRS "${__ndklibspath}" "${CMAKE_INSTALL_PREFIX}/libs/${ANDROID_NDK_ABI_NAME}" ) set( ANDROID_LINKER_FLAGS "" ) #STL if( ANDROID_USE_STLPORT ) if( EXISTS "${__stlLibPath}/libstlport_static.a" ) - __COPY_IF_DIFFERENT( "${__stlLibPath}/libstlport_static.a" "${CMAKE_BINARY_DIR}/systemlibs/${ANDROID_NDK_ABI_NAME}/libstlport_static.a" ) + __COPY_IF_DIFFERENT( "${__stlLibPath}/libstlport_static.a" "${__ndklibspath}/libstlport_static.a" ) endif() - if( EXISTS "${CMAKE_BINARY_DIR}/systemlibs/${ANDROID_NDK_ABI_NAME}/libstlport_static.a" ) + if( EXISTS "${__ndklibspath}/libstlport_static.a" ) set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} -Wl,--start-group -lstlport_static" ) endif() else( ANDROID_USE_STLPORT ) if( EXISTS "${__stlLibPath}/libgnustl_static.a" ) - __COPY_IF_DIFFERENT( "${__stlLibPath}/libgnustl_static.a" "${CMAKE_BINARY_DIR}/systemlibs/${ANDROID_NDK_ABI_NAME}/libstdc++.a" ) + __COPY_IF_DIFFERENT( "${__stlLibPath}/libgnustl_static.a" "${__ndklibspath}/libstdc++.a" ) elseif( ANDROID_ARCH_NAME STREQUAL "arm" AND EXISTS "${__stlLibPath}/${CMAKE_SYSTEM_PROCESSOR}/thumb/libstdc++.a" ) - __COPY_IF_DIFFERENT( "${__stlLibPath}/${CMAKE_SYSTEM_PROCESSOR}/thumb/libstdc++.a" "${CMAKE_BINARY_DIR}/systemlibs/${ANDROID_NDK_ABI_NAME}/libstdc++.a" ) + __COPY_IF_DIFFERENT( "${__stlLibPath}/${CMAKE_SYSTEM_PROCESSOR}/thumb/libstdc++.a" "${__ndklibspath}/libstdc++.a" ) elseif( ANDROID_ARCH_NAME STREQUAL "arm" AND EXISTS "${__stlLibPath}/${CMAKE_SYSTEM_PROCESSOR}/libstdc++.a" ) - __COPY_IF_DIFFERENT( "${__stlLibPath}/${CMAKE_SYSTEM_PROCESSOR}/libstdc++.a" "${CMAKE_BINARY_DIR}/systemlibs/${ANDROID_NDK_ABI_NAME}/libstdc++.a" ) + __COPY_IF_DIFFERENT( "${__stlLibPath}/${CMAKE_SYSTEM_PROCESSOR}/libstdc++.a" "${__ndklibspath}/libstdc++.a" ) elseif( ANDROID_ARCH_NAME STREQUAL "arm" AND EXISTS "${__stlLibPath}/thumb/libstdc++.a" ) - __COPY_IF_DIFFERENT( "${__stlLibPath}/thumb/libstdc++.a" "${CMAKE_BINARY_DIR}/systemlibs/${ANDROID_NDK_ABI_NAME}/libstdc++.a" ) + __COPY_IF_DIFFERENT( "${__stlLibPath}/thumb/libstdc++.a" "${__ndklibspath}/libstdc++.a" ) elseif( EXISTS "${__stlLibPath}/libstdc++.a" ) - __COPY_IF_DIFFERENT( "${__stlLibPath}/libstdc++.a" "${CMAKE_BINARY_DIR}/systemlibs/${ANDROID_NDK_ABI_NAME}/libstdc++.a" ) + __COPY_IF_DIFFERENT( "${__stlLibPath}/libstdc++.a" "${__ndklibspath}/libstdc++.a" ) endif() - if( EXISTS "${CMAKE_BINARY_DIR}/systemlibs/${ANDROID_NDK_ABI_NAME}/libstdc++.a" ) + if( EXISTS "${__ndklibspath}/libstdc++.a" ) set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} -lstdc++" ) endif() #gcc exception & rtti support if( EXISTS "${__stlLibPath}/libsupc++.a" ) - __COPY_IF_DIFFERENT( "${__stlLibPath}/libsupc++.a" "${CMAKE_BINARY_DIR}/systemlibs/${ANDROID_NDK_ABI_NAME}/libsupc++.a" ) + __COPY_IF_DIFFERENT( "${__stlLibPath}/libsupc++.a" "${__ndklibspath}/libsupc++.a" ) elseif( ANDROID_ARCH_NAME STREQUAL "arm" AND EXISTS "${ANDROID_TOOLCHAIN_ROOT}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/${CMAKE_SYSTEM_PROCESSOR}/thumb/libsupc++.a" ) - __COPY_IF_DIFFERENT( "${ANDROID_TOOLCHAIN_ROOT}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/${CMAKE_SYSTEM_PROCESSOR}/thumb/libsupc++.a" "${CMAKE_BINARY_DIR}/systemlibs/${ANDROID_NDK_ABI_NAME}/libsupc++.a" ) + __COPY_IF_DIFFERENT( "${ANDROID_TOOLCHAIN_ROOT}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/${CMAKE_SYSTEM_PROCESSOR}/thumb/libsupc++.a" "${__ndklibspath}/libsupc++.a" ) elseif( ANDROID_ARCH_NAME STREQUAL "arm" AND EXISTS "${ANDROID_TOOLCHAIN_ROOT}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/${CMAKE_SYSTEM_PROCESSOR}/libsupc++.a" ) - __COPY_IF_DIFFERENT( "${ANDROID_TOOLCHAIN_ROOT}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/${CMAKE_SYSTEM_PROCESSOR}/libsupc++.a" "${CMAKE_BINARY_DIR}/systemlibs/${ANDROID_NDK_ABI_NAME}/libsupc++.a" ) + __COPY_IF_DIFFERENT( "${ANDROID_TOOLCHAIN_ROOT}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/${CMAKE_SYSTEM_PROCESSOR}/libsupc++.a" "${__ndklibspath}/libsupc++.a" ) elseif( ANDROID_ARCH_NAME STREQUAL "arm" AND EXISTS "${ANDROID_TOOLCHAIN_ROOT}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/thumb/libsupc++.a" ) - __COPY_IF_DIFFERENT( "${ANDROID_TOOLCHAIN_ROOT}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/thumb/libsupc++.a" "${CMAKE_BINARY_DIR}/systemlibs/${ANDROID_NDK_ABI_NAME}/libsupc++.a" ) + __COPY_IF_DIFFERENT( "${ANDROID_TOOLCHAIN_ROOT}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/thumb/libsupc++.a" "${__ndklibspath}/libsupc++.a" ) elseif( EXISTS "${ANDROID_TOOLCHAIN_ROOT}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/libsupc++.a" ) - __COPY_IF_DIFFERENT( "${ANDROID_TOOLCHAIN_ROOT}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/libsupc++.a" "${CMAKE_BINARY_DIR}/systemlibs/${ANDROID_NDK_ABI_NAME}/libsupc++.a" ) + __COPY_IF_DIFFERENT( "${ANDROID_TOOLCHAIN_ROOT}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/libsupc++.a" "${__ndklibspath}/libsupc++.a" ) endif() - if( EXISTS "${CMAKE_BINARY_DIR}/systemlibs/${ANDROID_NDK_ABI_NAME}/libsupc++.a" ) + if( EXISTS "${__ndklibspath}/libsupc++.a" ) set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} -lsupc++" ) endif() endif( ANDROID_USE_STLPORT ) @@ -1038,13 +1042,14 @@ endmacro() # export toolchain settings for the try_compile() command if( NOT PROJECT_NAME STREQUAL "CMAKE_TRY_COMPILE" ) set( __toolchain_config "") - foreach( __var ANDROID_ABI ANDROID_FORCE_ARM_BUILD ANDROID_NATIVE_API_LEVEL ANDROID_NO_UNDEFINED ANDROID_SO_UNDEFINED ANDROID_SET_OBSOLETE_VARIABLES LIBRARY_OUTPUT_PATH_ROOT ANDROID_USE_STLPORT ANDROID_FORBID_SYGWIN ANDROID_NDK ANDROID_STANDALONE_TOOLCHAIN ANDROID_FUNCTION_LEVEL_LINKING ) + foreach( __var ANDROID_ABI ANDROID_FORCE_ARM_BUILD ANDROID_NATIVE_API_LEVEL ANDROID_NO_UNDEFINED ANDROID_SO_UNDEFINED ANDROID_SET_OBSOLETE_VARIABLES LIBRARY_OUTPUT_PATH_ROOT ANDROID_USE_STLPORT ANDROID_FORBID_SYGWIN ANDROID_NDK ANDROID_STANDALONE_TOOLCHAIN ANDROID_FUNCTION_LEVEL_LINKING __ndklibspath ) if( DEFINED ${__var} ) set( __toolchain_config "${__toolchain_config}set( ${__var} \"${${__var}}\" )\n" ) endif() endforeach() - file( WRITE "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/android.toolchain.config.cmake" "${__toolchain_config}" ) + file( WRITE "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/android.toolchain.config.cmake" "${__toolchain_config}" ) unset( __toolchain_config ) + unset( __ndklibspath ) endif() @@ -1073,6 +1078,7 @@ endif() # Can be set only at the first run: # ANDROID_NDK # ANDROID_STANDALONE_TOOLCHAIN +# ANDROID_TOOLCHAIN_NAME : "arm-linux-androideabi-4.4.3" or "arm-linux-androideabi-4.6" or "mipsel-linux-android-4.4.3" or "mipsel-linux-android-4.6" or "x86-4.4.3" or "x86-4.6" # Obsolete: # ANDROID_API_LEVEL : superseded by ANDROID_NATIVE_API_LEVEL # ARM_TARGET : superseded by ANDROID_ABI @@ -1105,7 +1111,6 @@ endif() # ANDROID_COMPILER_VERSION : GCC version used # ANDROID_CXX_FLAGS : C/C++ compiler flags required by Android platform # ANDROID_SUPPORTED_ABIS : list of currently allowed values for ANDROID_ABI -# ANDROID_TOOLCHAIN_NAME : "standalone", "arm-linux-androideabi-4.4.3" or "x86-4.4.3" or something similar. # ANDROID_TOOLCHAIN_MACHINE_NAME : "arm-linux-androideabi", "arm-eabi" or "i686-android-linux" # ANDROID_TOOLCHAIN_ROOT : path to the top level of toolchain (standalone or placed inside NDK) # ANDROID_SUPPORTED_NATIVE_API_LEVELS : list of native API levels found inside NDK diff --git a/cmake/OpenCVDetectOpenCL.cmake b/cmake/OpenCVDetectOpenCL.cmake index 903b55b652..96473a5003 100644 --- a/cmake/OpenCVDetectOpenCL.cmake +++ b/cmake/OpenCVDetectOpenCL.cmake @@ -2,8 +2,19 @@ if(APPLE) set(OPENCL_FOUND YES) set(OPENCL_LIBRARIES "-framework OpenCL") else() - find_package(OpenCL QUIET) - + #find_package(OpenCL QUIET) + if(WITH_OPENCLAMDFFT) + find_path(CLAMDFFT_INCLUDE_DIR + NAMES clAmdFft.h) + find_library(CLAMDFFT_LIBRARIES + NAMES clAmdFft.Runtime) + endif() + if(WITH_OPENCLAMDBLAS) + find_path(CLAMDBLAS_INCLUDE_DIR + NAMES clAmdBlas.h) + find_library(CLAMDBLAS_LIBRARIES + NAMES clAmdBlas) + endif() # Try AMD/ATI Stream SDK if (NOT OPENCL_FOUND) set(ENV_AMDSTREAMSDKROOT $ENV{AMDAPPSDKROOT}) diff --git a/cmake/templates/cvconfig.h.cmake b/cmake/templates/cvconfig.h.cmake index 1012008059..fb779c887d 100644 --- a/cmake/templates/cvconfig.h.cmake +++ b/cmake/templates/cvconfig.h.cmake @@ -175,6 +175,12 @@ /* OpenCL Support */ #cmakedefine HAVE_OPENCL +/* AMD's OpenCL Fast Fourier Transform Library*/ +#cmakedefine HAVE_CLAMDFFT + +/* AMD's Basic Linear Algebra Subprograms Library*/ +#cmakedefine HAVE_CLAMDBLAS + /* NVidia Cuda Fast Fourier Transform (FFT) API*/ #cmakedefine HAVE_CUFFT diff --git a/modules/contrib/src/stereovar.cpp b/modules/contrib/src/stereovar.cpp index 88640d86b2..1b542bbf52 100755 --- a/modules/contrib/src/stereovar.cpp +++ b/modules/contrib/src/stereovar.cpp @@ -67,11 +67,12 @@ StereoVar::~StereoVar() static Mat diffX(Mat &src) { - register int x, y, cols = src.cols - 1; + int cols = src.cols - 1; Mat dst(src.size(), src.type()); - for(y = 0; y < src.rows; y++){ + for(int y = 0; y < src.rows; y++){ const float* pSrc = src.ptr(y); float* pDst = dst.ptr(y); + int x = 0; #if CV_SSE2 for (x = 0; x <= cols - 8; x += 8) { __m128 a0 = _mm_loadu_ps(pSrc + x); diff --git a/modules/features2d/src/fast.cpp b/modules/features2d/src/fast.cpp index f496de3d51..fe496762ed 100644 --- a/modules/features2d/src/fast.cpp +++ b/modules/features2d/src/fast.cpp @@ -9,16 +9,16 @@ Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - *Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. + *Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. - *Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. + *Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. - *Neither the name of the University of Cambridge nor the names of - its contributors may be used to endorse or promote products derived - from this software without specific prior written permission. + *Neither the name of the University of Cambridge nor the names of + its contributors may be used to endorse or promote products derived + from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT @@ -350,7 +350,7 @@ int cornerScore<8>(const uchar* ptr, const int pixel[], int threshold) } int b0 = -a0; - for( k = 0; k < 12; k += 2 ) + for( k = 0; k < 8; k += 2 ) { int b = std::max((int)d[k+1], (int)d[k+2]); b = std::max(b, (int)d[k+3]); @@ -375,7 +375,10 @@ template void FAST_t(InputArray _img, std::vector& keypoints, int threshold, bool nonmax_suppression) { Mat img = _img.getMat(); - const int K = patternSize/2, N = patternSize + K + 1, quarterPatternSize = patternSize/4; + const int K = patternSize/2, N = patternSize + K + 1; +#if CV_SSE2 + const int quarterPatternSize = patternSize/4; +#endif int i, j, k, pixel[25]; makeOffsets(pixel, (int)img.step, patternSize); for(k = patternSize; k < 25; k++) @@ -585,7 +588,7 @@ FastFeatureDetector::FastFeatureDetector( int _threshold, bool _nonmaxSuppressio FastFeatureDetector::FastFeatureDetector( int _threshold, bool _nonmaxSuppression, int _type ) : threshold(_threshold), nonmaxSuppression(_nonmaxSuppression), type(_type) {} - + void FastFeatureDetector::detectImpl( const Mat& image, vector& keypoints, const Mat& mask ) const { Mat grayImage = image; diff --git a/modules/imgproc/src/smooth.cpp b/modules/imgproc/src/smooth.cpp index 2165673c84..91a004a866 100644 --- a/modules/imgproc/src/smooth.cpp +++ b/modules/imgproc/src/smooth.cpp @@ -1298,17 +1298,17 @@ public: maxk(_maxk), space_ofs(_space_ofs), space_weight(_space_weight), color_weight(_color_weight) { } - + virtual void operator() (const Range& range) const { int i, j, cn = dest->channels(), k; Size size = dest->size(); - + for( i = range.start; i < range.end; i++ ) { const uchar* sptr = temp->ptr(i+radius) + radius*cn; uchar* dptr = dest->ptr(i); - + if( cn == 1 ) { for( j = 0; j < size.width; j++ ) @@ -1351,10 +1351,10 @@ public: } } } - + private: - const Mat *temp; Mat *dest; + const Mat *temp; int radius, maxk, *space_ofs; float *space_weight, *color_weight; }; @@ -1367,40 +1367,40 @@ bilateralFilter_8u( const Mat& src, Mat& dst, int d, int cn = src.channels(); int i, j, maxk, radius; Size size = src.size(); - + CV_Assert( (src.type() == CV_8UC1 || src.type() == CV_8UC3) && src.type() == dst.type() && src.size() == dst.size() && src.data != dst.data ); - + if( sigma_color <= 0 ) sigma_color = 1; if( sigma_space <= 0 ) sigma_space = 1; - + double gauss_color_coeff = -0.5/(sigma_color*sigma_color); double gauss_space_coeff = -0.5/(sigma_space*sigma_space); - + if( d <= 0 ) radius = cvRound(sigma_space*1.5); else radius = d/2; radius = MAX(radius, 1); d = radius*2 + 1; - + Mat temp; copyMakeBorder( src, temp, radius, radius, radius, radius, borderType ); - + vector _color_weight(cn*256); vector _space_weight(d*d); vector _space_ofs(d*d); float* color_weight = &_color_weight[0]; float* space_weight = &_space_weight[0]; int* space_ofs = &_space_ofs[0]; - + // initialize color-related bilateral filter coefficients for( i = 0; i < 256*cn; i++ ) color_weight[i] = (float)std::exp(i*i*gauss_color_coeff); - + // initialize space-related bilateral filter coefficients for( i = -radius, maxk = 0; i <= radius; i++ ) for( j = -radius; j <= radius; j++ ) @@ -1411,7 +1411,7 @@ bilateralFilter_8u( const Mat& src, Mat& dst, int d, space_weight[maxk] = (float)std::exp(r*r*gauss_space_coeff); space_ofs[maxk++] = (int)(i*temp.step + j*cn); } - + BilateralFilter_8u_Invoker body(dst, temp, radius, maxk, space_ofs, space_weight, color_weight); parallel_for_(Range(0, size.height), body); } diff --git a/modules/ocl/CMakeLists.txt b/modules/ocl/CMakeLists.txt index b331e9f1e6..24595770df 100644 --- a/modules/ocl/CMakeLists.txt +++ b/modules/ocl/CMakeLists.txt @@ -29,6 +29,14 @@ if (HAVE_OPENCL) if(OPENCL_INCLUDE_DIR) ocv_include_directories(${OPENCL_INCLUDE_DIR}) endif() + if (HAVE_CLAMDFFT) + set(ocl_link_libs ${ocl_link_libs} ${CLAMDFFT_LIBRARIES}) + ocv_include_directories(${CLAMDFFT_INCLUDE_DIR}) + endif() + if (HAVE_CLAMDBLAS) + set(ocl_link_libs ${ocl_link_libs} ${CLAMDBLAS_LIBRARIES}) + ocv_include_directories(${CLAMDBLAS_INCLUDE_DIR}) + endif() endif() ocv_set_module_sources( diff --git a/modules/ocl/include/opencv2/ocl/ocl.hpp b/modules/ocl/include/opencv2/ocl/ocl.hpp index 0efc72283a..3bc6729e48 100644 --- a/modules/ocl/include/opencv2/ocl/ocl.hpp +++ b/modules/ocl/include/opencv2/ocl/ocl.hpp @@ -858,7 +858,72 @@ namespace cv void benchmark_copy_vectorize(const oclMat &src, oclMat &dst); void benchmark_copy_offset_stride(const oclMat &src, oclMat &dst); void benchmark_ILP(); - + + //! computes vertical sum, supports only CV_32FC1 images + CV_EXPORTS void columnSum(const oclMat& src, oclMat& sum); + + //! performs linear blending of two images + //! to avoid accuracy errors sum of weigths shouldn't be very close to zero + // supports only CV_8UC1 source type + CV_EXPORTS void blendLinear(const oclMat& img1, const oclMat& img2, const oclMat& weights1, const oclMat& weights2, oclMat& result); + + /////////////////////////////// Pyramid ///////////////////////////////////// + CV_EXPORTS void pyrDown(const oclMat& src, oclMat& dst); + + //! upsamples the source image and then smoothes it + CV_EXPORTS void pyrUp(const cv::ocl::oclMat& src,cv::ocl::oclMat& dst); + + ///////////////////////////////////////// match_template ///////////////////////////////////////////////////////////// + struct CV_EXPORTS MatchTemplateBuf + { + Size user_block_size; + oclMat imagef, templf; + std::vector images; + std::vector image_sums; + std::vector image_sqsums; + }; + + + //! computes the proximity map for the raster template and the image where the template is searched for + // Supports TM_SQDIFF, TM_SQDIFF_NORMED, TM_CCORR, TM_CCORR_NORMED, TM_CCOEFF, TM_CCOEFF_NORMED for type 8UC1 and 8UC4 + // Supports TM_SQDIFF, TM_CCORR for type 32FC1 and 32FC4 + CV_EXPORTS void matchTemplate(const oclMat& image, const oclMat& templ, oclMat& result, int method); + + //! computes the proximity map for the raster template and the image where the template is searched for + // Supports TM_SQDIFF, TM_SQDIFF_NORMED, TM_CCORR, TM_CCORR_NORMED, TM_CCOEFF, TM_CCOEFF_NORMED for type 8UC1 and 8UC4 + // Supports TM_SQDIFF, TM_CCORR for type 32FC1 and 32FC4 + CV_EXPORTS void matchTemplate(const oclMat& image, const oclMat& templ, oclMat& result, int method, MatchTemplateBuf& buf); + +#ifdef HAVE_CLAMDFFT + ///////////////////////////////////////// clAmdFft related ///////////////////////////////////////// + // the two functions must be called before/after run any fft library functions. + CV_EXPORTS void fft_setup(); // this will be implicitly invoked + CV_EXPORTS void fft_teardown(); // you need to teardown fft library manually + + /////////////////////////////////////// DFT ///////////////////////////////////////////////////// + //! Performs a forward or inverse discrete Fourier transform (1D or 2D) of floating point matrix. + //! Param dft_size is the size of DFT transform. + //! + //! For complex-to-real transform it is assumed that the source matrix is packed in CLFFT's format. + // support src type of CV32FC1, CV32FC2 + // support flags: DFT_INVERSE, DFT_REAL_OUTPUT, DFT_COMPLEX_OUTPUT, DFT_ROWS + // dft_size is the size of original input, which is used for transformation from complex to real. + // dft_size must be powers of 2, 3 and 5 + // real to complex dft requires at least v1.8 clAmdFft + // real to complex dft output is not the same with cpu version + // real to complex and complex to real does not support DFT_ROWS + CV_EXPORTS void dft(const oclMat& src, oclMat& dst, Size dft_size = Size(0, 0), int flags = 0); +#endif // HAVE_CLAMDFFT + +#ifdef HAVE_CLAMDBLAS + //! implements generalized matrix product algorithm GEMM from BLAS + // The functionality requires clAmdBlas library + // only support type CV_32FC1 + // flag GEMM_3_T is not supported + CV_EXPORTS void gemm(const oclMat& src1, const oclMat& src2, double alpha, + const oclMat& src3, double beta, oclMat& dst, int flags = 0); +#endif + } } #include "opencv2/ocl/matrix_operations.hpp" diff --git a/modules/ocl/src/blend.cpp b/modules/ocl/src/blend.cpp new file mode 100644 index 0000000000..a9df907d3c --- /dev/null +++ b/modules/ocl/src/blend.cpp @@ -0,0 +1,98 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// @Authors +// Nathan, liujun@multicorewareinc.com +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other oclMaterials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "precomp.hpp" +#include + +using namespace cv; +using namespace cv::ocl; +using namespace std; + +#if !defined (HAVE_OPENCL) +void cv::ocl::blendLinear(const oclMat& img1, const oclMat& img2, const oclMat& weights1, const oclMat& weights2, + oclMat& result){throw_nogpu();} +#else +namespace cv +{ + namespace ocl + { + ////////////////////////////////////OpenCL kernel strings////////////////////////// + extern const char *blend_linear; + } +} + +void cv::ocl::blendLinear(const oclMat& img1, const oclMat& img2, const oclMat& weights1, const oclMat& weights2, + oclMat& result) +{ + cv::ocl::Context *ctx = img1.clCxt; + assert(ctx == img2.clCxt && ctx == weights1.clCxt && ctx == weights2.clCxt); + int channels = img1.channels(); + int depth = img1.depth(); + int rows = img1.rows; + int cols = img1.cols; + int istep = img1.step; + int wstep = weights1.step; + size_t globalSize[] = {cols * channels, rows, 1}; + size_t localSize[] = {16, 16, 1}; + + vector< pair > args; + + if(globalSize[0]!=0) + { + args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data )); + args.push_back( make_pair( sizeof(cl_mem), (void *)&img1.data )); + args.push_back( make_pair( sizeof(cl_mem), (void *)&img2.data )); + args.push_back( make_pair( sizeof(cl_mem), (void *)&weights1.data )); + args.push_back( make_pair( sizeof(cl_mem), (void *)&weights2.data )); + args.push_back( make_pair( sizeof(cl_int), (void *)&rows )); + args.push_back( make_pair( sizeof(cl_int), (void *)&cols )); + args.push_back( make_pair( sizeof(cl_int), (void *)&istep )); + args.push_back( make_pair( sizeof(cl_int), (void *)&wstep )); + std::string kernelName = "BlendLinear"; + + openCLExecuteKernel(ctx, &blend_linear, kernelName, globalSize, localSize, args, channels, depth); + } +} +#endif \ No newline at end of file diff --git a/modules/ocl/src/columnsum.cpp b/modules/ocl/src/columnsum.cpp new file mode 100644 index 0000000000..e789d38b09 --- /dev/null +++ b/modules/ocl/src/columnsum.cpp @@ -0,0 +1,91 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// @Authors +// Chunpeng Zhang, chunpeng@multicorewareinc.com +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other oclMaterials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors as is and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include +#include "precomp.hpp" + +using namespace cv; +using namespace cv::ocl; +using namespace std; + + +#if !defined(HAVE_OPENCL) + +void cv::ocl::columnSum(const oclMat& src,oclMat& dst){ throw_nogpu(); } + +#else /*!HAVE_OPENCL */ + +namespace cv +{ + namespace ocl + { + extern const char* imgproc_columnsum; + } +} + +void cv::ocl::columnSum(const oclMat& src,oclMat& dst) +{ + CV_Assert(src.type() == CV_32FC1 && dst.type() == CV_32FC1 && src.size() == dst.size()); + + Context *clCxt = src.clCxt; + + const std::string kernelName = "columnSum"; + + std::vector< pair > args; + + args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data)); + args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data)); + args.push_back( make_pair( sizeof(cl_int), (void *)&src.cols)); + args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows)); + args.push_back( make_pair( sizeof(cl_int), (void *)&src.step)); + args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step)); + + size_t globalThreads[3] = {dst.cols, dst.rows, 1}; + size_t localThreads[3] = {16, 16, 1}; + + openCLExecuteKernel(clCxt, &imgproc_columnsum, kernelName, globalThreads, localThreads, args, src.channels(), src.depth()); + +} +#endif \ No newline at end of file diff --git a/modules/ocl/src/fft.cpp b/modules/ocl/src/fft.cpp new file mode 100644 index 0000000000..b3eda35c18 --- /dev/null +++ b/modules/ocl/src/fft.cpp @@ -0,0 +1,302 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// @Authors +// Peng Xiao, pengxiao@multicorewareinc.com +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other oclMaterials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors as is and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ +#include +#include "precomp.hpp" + +#ifdef HAVE_CLAMDFFT + +using namespace cv; +using namespace cv::ocl; +using namespace std; + +#if !defined (HAVE_OPENCL) +void cv::ocl::dft(const oclMat& src, oclMat& dst, int flags) { throw_nogpu(); } +#else + +#include + +namespace cv{ namespace ocl { + enum FftType + { + C2R = 1, // complex to complex + R2C = 2, // real to opencl HERMITIAN_INTERLEAVED + C2C = 3 // opencl HERMITIAN_INTERLEAVED to real + }; + struct FftPlan + { + friend void fft_setup(); + friend void fft_teardown(); + ~FftPlan(); + protected: + FftPlan(Size _dft_size, int _src_step, int _dst_step, int _flags, FftType _type); + const Size dft_size; + const int src_step, dst_step; + const int flags; + const FftType type; + clAmdFftPlanHandle plHandle; + static vector planStore; + static bool started; + static clAmdFftSetupData * setupData; + public: + // return a baked plan-> + // if there is one matched plan, return it + // if not, bake a new one, put it into the planStore and return it. + static clAmdFftPlanHandle getPlan(Size _dft_size, int _src_step, int _dst_step, int _flags, FftType _type); + }; +}} +bool cv::ocl::FftPlan::started = false; +vector cv::ocl::FftPlan::planStore = vector(); +clAmdFftSetupData * cv::ocl::FftPlan::setupData = 0; + +void cv::ocl::fft_setup() +{ + if(FftPlan::started) + { + return; + } + FftPlan::setupData = new clAmdFftSetupData; + openCLSafeCall(clAmdFftInitSetupData( FftPlan::setupData )); + FftPlan::started = true; +} +void cv::ocl::fft_teardown() +{ + if(!FftPlan::started) + { + return; + } + delete FftPlan::setupData; + for(int i = 0; i < FftPlan::planStore.size(); i ++) + { + delete FftPlan::planStore[i]; + } + FftPlan::planStore.clear(); + openCLSafeCall( clAmdFftTeardown( ) ); + FftPlan::started = false; +} + +// bake a new plan +cv::ocl::FftPlan::FftPlan(Size _dft_size, int _src_step, int _dst_step, int _flags, FftType _type) + : dft_size(_dft_size), src_step(_src_step), dst_step(_dst_step), flags(_flags), type(_type), plHandle(0) +{ + if(!FftPlan::started) + { + // implicitly do fft setup + fft_setup(); + } + + bool is_1d_input = (_dft_size.height == 1); + int is_row_dft = flags & DFT_ROWS; + int is_scaled_dft = flags & DFT_SCALE; + int is_inverse = flags & DFT_INVERSE; + + clAmdFftResultLocation place; + clAmdFftLayout inLayout; + clAmdFftLayout outLayout; + clAmdFftDim dim = is_1d_input||is_row_dft ? CLFFT_1D : CLFFT_2D; + + size_t batchSize = is_row_dft?dft_size.height : 1; + size_t clLengthsIn[ 3 ] = {1, 1, 1}; + size_t clStridesIn[ 3 ] = {1, 1, 1}; + size_t clLengthsOut[ 3 ] = {1, 1, 1}; + size_t clStridesOut[ 3 ] = {1, 1, 1}; + clLengthsIn[0] = dft_size.width; + clLengthsIn[1] = is_row_dft ? 1 : dft_size.height; + clStridesIn[0] = 1; + clStridesOut[0] = 1; + + switch(_type) + { + case C2C: + inLayout = CLFFT_COMPLEX_INTERLEAVED; + outLayout = CLFFT_COMPLEX_INTERLEAVED; + clStridesIn[1] = src_step / sizeof(std::complex); + clStridesOut[1] = clStridesIn[1]; + break; + case R2C: + CV_Assert(!is_row_dft); // this is not supported yet + inLayout = CLFFT_REAL; + outLayout = CLFFT_HERMITIAN_INTERLEAVED; + clStridesIn[1] = src_step / sizeof(float); + clStridesOut[1] = dst_step / sizeof(std::complex); + break; + case C2R: + CV_Assert(!is_row_dft); // this is not supported yet + inLayout = CLFFT_HERMITIAN_INTERLEAVED; + outLayout = CLFFT_REAL; + clStridesIn[1] = src_step / sizeof(std::complex); + clStridesOut[1] = dst_step / sizeof(float); + break; + default: + //std::runtime_error("does not support this convertion!"); + cout << "Does not support this convertion!" << endl; + throw exception(); + break; + } + + clStridesIn[2] = is_row_dft ? clStridesIn[1] : dft_size.width * clStridesIn[1]; + clStridesOut[2] = is_row_dft ? clStridesOut[1] : dft_size.width * clStridesOut[1]; + + openCLSafeCall( clAmdFftCreateDefaultPlan( &plHandle, Context::getContext()->impl->clContext, dim, clLengthsIn ) ); + + openCLSafeCall( clAmdFftSetResultLocation( plHandle, CLFFT_OUTOFPLACE ) ); + openCLSafeCall( clAmdFftSetLayout( plHandle, inLayout, outLayout ) ); + openCLSafeCall( clAmdFftSetPlanBatchSize( plHandle, batchSize ) ); + + openCLSafeCall( clAmdFftSetPlanInStride ( plHandle, dim, clStridesIn ) ); + openCLSafeCall( clAmdFftSetPlanOutStride ( plHandle, dim, clStridesOut ) ); + openCLSafeCall( clAmdFftSetPlanDistance ( plHandle, clStridesIn[ dim ], clStridesIn[ dim ]) ); + openCLSafeCall( clAmdFftBakePlan( plHandle, 1, &(Context::getContext()->impl->clCmdQueue), NULL, NULL ) ); +} +cv::ocl::FftPlan::~FftPlan() +{ + for(int i = 0; i < planStore.size(); i ++) + { + if(planStore[i]->plHandle == plHandle) + { + planStore.erase(planStore.begin()+ i); + } + } + openCLSafeCall( clAmdFftDestroyPlan( &plHandle ) ); +} + +clAmdFftPlanHandle cv::ocl::FftPlan::getPlan(Size _dft_size, int _src_step, int _dst_step, int _flags, FftType _type) +{ + // go through search + for(int i = 0; i < planStore.size(); i ++) + { + FftPlan * plan = planStore[i]; + if( + plan->dft_size.width == _dft_size.width && + plan->dft_size.height == _dft_size.height && + plan->flags == _flags && + plan->src_step == _src_step && + plan->dst_step == _dst_step && + plan->type == _type + ) + { + return plan->plHandle; + } + } + // no baked plan is found + FftPlan *newPlan = new FftPlan(_dft_size, _src_step, _dst_step, _flags, _type); + planStore.push_back(newPlan); + return newPlan->plHandle; +} + +void cv::ocl::dft(const oclMat& src, oclMat& dst, Size dft_size, int flags) +{ + if(dft_size == Size(0,0)) + { + dft_size = src.size(); + } + // check if the given dft size is of optimal dft size + CV_Assert(dft_size.area() == getOptimalDFTSize(dft_size.area())); + + // similar assertions with cuda module + CV_Assert(src.type() == CV_32F || src.type() == CV_32FC2); + + // we don't support DFT_SCALE flag + CV_Assert(!(DFT_SCALE & flags)); + + bool is_1d_input = (src.rows == 1); + int is_row_dft = flags & DFT_ROWS; + int is_scaled_dft = flags & DFT_SCALE; + int is_inverse = flags & DFT_INVERSE; + bool is_complex_input = src.channels() == 2; + bool is_complex_output = !(flags & DFT_REAL_OUTPUT); + + // We don't support real-to-real transform + CV_Assert(is_complex_input || is_complex_output); + FftType type = (FftType)(is_complex_input << 0 | is_complex_output << 1); + + switch(type) + { + case C2C: + dst.create(src.rows, src.cols, CV_32FC2); + break; + case R2C: + CV_Assert(!is_row_dft); // this is not supported yet + dst.create(src.rows, src.cols/2 + 1, CV_32FC2); + break; + case C2R: + CV_Assert(dft_size.width / 2 + 1 == src.cols && dft_size.height == src.rows); + CV_Assert(!is_row_dft); // this is not supported yet + dst.create(src.rows, dft_size.width, CV_32FC1); + break; + default: + //std::runtime_error("does not support this convertion!"); + cout << "Does not support this convertion!" << endl; + throw exception(); + break; + } + clAmdFftPlanHandle plHandle = FftPlan::getPlan(dft_size, src.step, dst.step, flags, type); + + //get the buffersize + size_t buffersize=0; + openCLSafeCall( clAmdFftGetTmpBufSize(plHandle, &buffersize ) ); + + //allocate the intermediate buffer + cl_mem clMedBuffer=NULL; + if (buffersize) + { + cl_int medstatus; + clMedBuffer = clCreateBuffer ( src.clCxt->impl->clContext, CL_MEM_READ_WRITE, buffersize, 0, &medstatus); + openCLSafeCall( medstatus ); + } + openCLSafeCall( clAmdFftEnqueueTransform( plHandle, + is_inverse?CLFFT_BACKWARD:CLFFT_FORWARD, + 1, + &src.clCxt->impl->clCmdQueue, + 0, NULL, NULL, + (cl_mem*)&src.data, (cl_mem*)&dst.data, clMedBuffer ) ); + openCLSafeCall( clFinish(src.clCxt->impl->clCmdQueue) ); + if(clMedBuffer) + { + openCLFree(clMedBuffer); + } +} + +#endif +#endif //HAVE_CLAMDFFT diff --git a/modules/ocl/src/gemm.cpp b/modules/ocl/src/gemm.cpp new file mode 100644 index 0000000000..c35e061826 --- /dev/null +++ b/modules/ocl/src/gemm.cpp @@ -0,0 +1,161 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// @Authors +// Peng Xiao, pengxiao@multicorewareinc.com +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other oclMaterials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors as is and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include +#include "precomp.hpp" + +#ifdef HAVE_CLAMDBLAS + +#include "clAmdBlas.h" + +#if !defined (HAVE_OPENCL) +void cv::ocl::dft(const oclMat& src, oclMat& dst, int flags) { throw_nogpu(); } +#else + +using namespace cv; + + void cv::ocl::gemm(const oclMat& src1, const oclMat& src2, double alpha, + const oclMat& src3, double beta, oclMat& dst, int flags) + { + CV_Assert(src1.cols == src2.rows && + (src3.empty() || src1.rows == src3.rows && src2.cols == src3.cols)); + CV_Assert(!(cv::GEMM_3_T & flags)); // cv::GEMM_3_T is not supported + if(!src3.empty()) + { + src3.copyTo(dst); + } + else + { + dst.create(src1.rows, src2.cols, src1.type()); + dst.setTo(Scalar::all(0)); + } + openCLSafeCall( clAmdBlasSetup() ); + + const clAmdBlasTranspose transA = (cv::GEMM_1_T & flags)?clAmdBlasTrans:clAmdBlasNoTrans; + const clAmdBlasTranspose transB = (cv::GEMM_2_T & flags)?clAmdBlasTrans:clAmdBlasNoTrans; + const clAmdBlasOrder order = clAmdBlasRowMajor; + + const int M = src1.rows; + const int N = src2.cols; + const int K = src1.cols; + int lda = src1.step; + int ldb = src2.step; + int ldc = dst.step; + int offa = src1.offset; + int offb = src2.offset; + int offc = dst.offset; + + + switch(src1.type()) + { + case CV_32FC1: + lda /= sizeof(float); + ldb /= sizeof(float); + ldc /= sizeof(float); + offa /= sizeof(float); + offb /= sizeof(float); + offc /= sizeof(float); + openCLSafeCall + ( + clAmdBlasSgemmEx(order, transA, transB, M, N, K, + alpha, (const cl_mem)src1.data, offa, lda, (const cl_mem)src2.data, offb, ldb, + beta, (cl_mem)dst.data, offc, ldc, 1, &src1.clCxt->impl->clCmdQueue, 0, NULL, NULL) + ); + break; + case CV_64FC1: + lda /= sizeof(double); + ldb /= sizeof(double); + ldc /= sizeof(double); + offa /= sizeof(double); + offb /= sizeof(double); + offc /= sizeof(double); + openCLSafeCall + ( + clAmdBlasDgemmEx(order, transA, transB, M, N, K, + alpha, (const cl_mem)src1.data, offa, lda, (const cl_mem)src2.data, offb, ldb, + beta, (cl_mem)dst.data, offc, ldc, 1, &src1.clCxt->impl->clCmdQueue, 0, NULL, NULL) + ); + break; + case CV_32FC2: + { + lda /= sizeof(std::complex); + ldb /= sizeof(std::complex); + ldc /= sizeof(std::complex); + offa /= sizeof(std::complex); + offb /= sizeof(std::complex); + offc /= sizeof(std::complex); + cl_float2 alpha_2 = {{alpha, 0}}; + cl_float2 beta_2 = {{beta, 0}}; + openCLSafeCall + ( + clAmdBlasCgemmEx(order, transA, transB, M, N, K, + alpha_2, (const cl_mem)src1.data, offa, lda, (const cl_mem)src2.data, offb, ldb, + beta_2, (cl_mem)dst.data, offc, ldc, 1, &src1.clCxt->impl->clCmdQueue, 0, NULL, NULL) + ); + } + break; + case CV_64FC2: + { + lda /= sizeof(std::complex); + ldb /= sizeof(std::complex); + ldc /= sizeof(std::complex); + offa /= sizeof(std::complex); + offb /= sizeof(std::complex); + offc /= sizeof(std::complex); + cl_double2 alpha_2 = {{alpha, 0}}; + cl_double2 beta_2 = {{beta, 0}}; + openCLSafeCall + ( + clAmdBlasZgemmEx(order, transA, transB, M, N, K, + alpha_2, (const cl_mem)src1.data, offa, lda, (const cl_mem)src2.data, offb, ldb, + beta_2, (cl_mem)dst.data, offc, ldc, 1, &src1.clCxt->impl->clCmdQueue, 0, NULL, NULL) + ); + } + break; + } + clAmdBlasTeardown(); + } +#endif +#endif diff --git a/modules/ocl/src/kernels/blend_linear.cl b/modules/ocl/src/kernels/blend_linear.cl new file mode 100644 index 0000000000..bf733576c0 --- /dev/null +++ b/modules/ocl/src/kernels/blend_linear.cl @@ -0,0 +1,196 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, MulticoreWare Inc., all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// @Authors +// Liu Liujun, liujun@multicorewareinc.com +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other GpuMaterials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors as is and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ +__kernel void BlendLinear_C1_D0( + __global uchar *dst, + __global uchar *img1, + __global uchar *img2, + __global float *weight1, + __global float *weight2, + int rows, + int cols, + int istep, + int wstep + ) +{ + int idx = get_global_id(0); + int idy = get_global_id(1); + if (idx < cols && idy < rows) + { + int pos = idy * istep + idx; + int wpos = idy * (wstep /sizeof(float)) + idx; + float w1 = weight1[wpos]; + float w2 = weight2[wpos]; + dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f); + + } +} + +__kernel void BlendLinear_C3_D0( + __global uchar *dst, + __global uchar *img1, + __global uchar *img2, + __global float *weight1, + __global float *weight2, + int rows, + int cols, + int istep, + int wstep + ) +{ + int idx = get_global_id(0); + int idy = get_global_id(1); + int x = idx / 3; + int y = idy; + if (x < cols && y < rows) + { + int pos = idy * istep + idx; + int wpos = idy * (wstep /sizeof(float)) + x; + float w1 = weight1[wpos]; + float w2 = weight2[wpos]; + dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f); + } +} + +__kernel void BlendLinear_C4_D0( + __global uchar *dst, + __global uchar *img1, + __global uchar *img2, + __global float *weight1, + __global float *weight2, + int rows, + int cols, + int istep, + int wstep + ) +{ + int idx = get_global_id(0); + int idy = get_global_id(1); + int x = idx / 4; + int y = idy; + if (x < cols && y < rows) + { + int pos = idy * istep + idx; + int wpos = idy * (wstep /sizeof(float)) + x; + float w1 = weight1[wpos]; + float w2 = weight2[wpos]; + dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f); + } +} + +__kernel void BlendLinear_C1_D5( + __global float *dst, + __global float *img1, + __global float *img2, + __global float *weight1, + __global float *weight2, + int rows, + int cols, + int istep, + int wstep + ) +{ + int idx = get_global_id(0); + int idy = get_global_id(1); + if (idx < cols && idy < rows) + { + int pos = idy * (istep / sizeof(float)) + idx; + int wpos = idy * (wstep /sizeof(float)) + idx; + float w1 = weight1[wpos]; + float w2 = weight2[wpos]; + dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f); + } +} + +__kernel void BlendLinear_C3_D5( + __global float *dst, + __global float *img1, + __global float *img2, + __global float *weight1, + __global float *weight2, + int rows, + int cols, + int istep, + int wstep + ) +{ + int idx = get_global_id(0); + int idy = get_global_id(1); + int x = idx / 3; + int y = idy; + if (x < cols && y < rows) + { + int pos = idy * (istep / sizeof(float)) + idx; + int wpos = idy * (wstep /sizeof(float)) + x; + float w1 = weight1[wpos]; + float w2 = weight2[wpos]; + dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f); + } +} + +__kernel void BlendLinear_C4_D5( + __global float *dst, + __global float *img1, + __global float *img2, + __global float *weight1, + __global float *weight2, + int rows, + int cols, + int istep, + int wstep + ) +{ + int idx = get_global_id(0); + int idy = get_global_id(1); + int x = idx / 4; + int y = idy; + if (x < cols && y < rows) + { + int pos = idy * (istep / sizeof(float)) + idx; + int wpos = idy * (wstep /sizeof(float)) + x; + float w1 = weight1[wpos]; + float w2 = weight2[wpos]; + dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f); + } +} diff --git a/modules/ocl/src/kernels/imgproc_columnsum.cl b/modules/ocl/src/kernels/imgproc_columnsum.cl new file mode 100644 index 0000000000..913b417d15 --- /dev/null +++ b/modules/ocl/src/kernels/imgproc_columnsum.cl @@ -0,0 +1,80 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// @Authors +// Chunpeng Zhang chunpeng@multicorewareinc.com +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other oclMaterials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors as is and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#pragma OPENCL EXTENSION cl_amd_printf : enable +#if defined (__ATI__) +#pragma OPENCL EXTENSION cl_amd_fp64:enable + +#elif defined (__NVIDIA__) +#pragma OPENCL EXTENSION cl_khr_fp64:enable +#endif + +//////////////////////////////////////////////////////////////////// +///////////////////////// columnSum //////////////////////////////// +//////////////////////////////////////////////////////////////////// +/// CV_32FC1 +__kernel void columnSum_C1_D5(__global float* src,__global float* dst,int srcCols,int srcRows,int srcStep,int dstStep) +{ + const int x = get_global_id(0); + + srcStep >>= 2; + dstStep >>= 2; + + if (x < srcCols) + { + int srcIdx = x ; + int dstIdx = x ; + + float sum = 0; + + for (int y = 0; y < srcRows; ++y) + { + sum += src[srcIdx]; + dst[dstIdx] = sum; + srcIdx += srcStep; + dstIdx += dstStep; + } + } +} diff --git a/modules/ocl/src/kernels/match_template.cl b/modules/ocl/src/kernels/match_template.cl new file mode 100644 index 0000000000..4c5a4fc9ca --- /dev/null +++ b/modules/ocl/src/kernels/match_template.cl @@ -0,0 +1,824 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// @Authors +// Peng Xiao, pengxiao@multicorewareinc.com +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other oclMaterials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors as is and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#pragma OPENCL EXTENSION cl_amd_printf : enable + +#if defined (__ATI__) +#pragma OPENCL EXTENSION cl_amd_fp64:enable + +#elif defined (__NVIDIA__) +#pragma OPENCL EXTENSION cl_khr_fp64:enable +#endif + +#if !defined(USE_SQR_INTEGRAL) && (defined (__ATI__) || defined (__NVIDIA__)) +#define TYPE_IMAGE_SQSUM double +#else +#define TYPE_IMAGE_SQSUM ulong +#endif + +////////////////////////////////////////////////// +// utilities +#define SQSUMS_PTR(ox, oy) mad24(gidy + oy, img_sqsums_step, gidx + img_sqsums_offset + ox) +#define SUMS_PTR(ox, oy) mad24(gidy + oy, img_sums_step, gidx + img_sums_offset + ox) +// normAcc* are accurate normalization routines which make GPU matchTemplate +// consistent with CPU one +float normAcc(float num, float denum) +{ + if(fabs(num) < denum) + { + return num / denum; + } + if(fabs(num) < denum * 1.125f) + { + return num > 0 ? 1 : -1; + } + return 0; +} + +float normAcc_SQDIFF(float num, float denum) +{ + if(fabs(num) < denum) + { + return num / denum; + } + if(fabs(num) < denum * 1.125f) + { + return num > 0 ? 1 : -1; + } + return 1; +} +////////////////////////////////////////////////////////////////////// +// normalize + +__kernel +void normalizeKernel_C1_D0 +( + __global const TYPE_IMAGE_SQSUM * img_sqsums, + __global float * res, + ulong tpl_sqsum, + int res_rows, + int res_cols, + int tpl_rows, + int tpl_cols, + int img_sqsums_offset, + int img_sqsums_step, + int res_offset, + int res_step +) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + + res_step /= sizeof(*res); + res_offset /= sizeof(*res); + img_sqsums_step /= sizeof(*img_sqsums); + img_sqsums_offset /= sizeof(*img_sqsums); + int res_idx = mad24(gidy, res_step, res_offset + gidx); + if(gidx < res_cols && gidy < res_rows) + { + float image_sqsum_ = (float)( + (img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) - + (img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)])); + res[res_idx] = normAcc(res[res_idx], sqrt(image_sqsum_ * tpl_sqsum)); + } +} + +__kernel +void matchTemplate_Prepared_SQDIFF_C1_D0 +( + __global const TYPE_IMAGE_SQSUM * img_sqsums, + __global float * res, + ulong tpl_sqsum, + int res_rows, + int res_cols, + int tpl_rows, + int tpl_cols, + int img_sqsums_offset, + int img_sqsums_step, + int res_offset, + int res_step +) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + + res_step /= sizeof(*res); + res_offset /= sizeof(*res); + img_sqsums_step /= sizeof(*img_sqsums); + img_sqsums_offset /= sizeof(*img_sqsums); + int res_idx = mad24(gidy, res_step, res_offset + gidx); + if(gidx < res_cols && gidy < res_rows) + { + float image_sqsum_ = (float)( + (img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) - + (img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)])); + res[res_idx] = image_sqsum_ - 2.f * res[res_idx] + tpl_sqsum; + } +} + +__kernel +void matchTemplate_Prepared_SQDIFF_NORMED_C1_D0 +( + __global const TYPE_IMAGE_SQSUM * img_sqsums, + __global float * res, + ulong tpl_sqsum, + int res_rows, + int res_cols, + int tpl_rows, + int tpl_cols, + int img_sqsums_offset, + int img_sqsums_step, + int res_offset, + int res_step +) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + + res_step /= sizeof(*res); + res_offset /= sizeof(*res); + img_sqsums_step /= sizeof(*img_sqsums); + img_sqsums_offset /= sizeof(*img_sqsums); + int res_idx = mad24(gidy, res_step, res_offset + gidx); + if(gidx < res_cols && gidy < res_rows) + { + float image_sqsum_ = (float)( + (img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) - + (img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)])); + res[res_idx] = normAcc_SQDIFF(image_sqsum_ - 2.f * res[res_idx] + tpl_sqsum, + sqrt(image_sqsum_ * tpl_sqsum)); + } +} + +////////////////////////////////////////////////// +// SQDIFF +__kernel +void matchTemplate_Naive_SQDIFF_C1_D0 +( + __global const uchar * img, + __global const uchar * tpl, + __global float * res, + int img_rows, + int img_cols, + int tpl_rows, + int tpl_cols, + int res_rows, + int res_cols, + int img_offset, + int tpl_offset, + int res_offset, + int img_step, + int tpl_step, + int res_step +) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int i,j; + int delta; + int sum = 0; + res_step /= sizeof(*res); + res_offset /= sizeof(*res); + int res_idx = mad24(gidy, res_step, res_offset + gidx); + + if(gidx < res_cols && gidy < res_rows) + { + for(i = 0; i < tpl_rows; i ++) + { + // get specific rows of img data + __global const uchar * img_ptr = img + mad24(gidy + i, img_step, gidx + img_offset); + __global const uchar * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset); + for(j = 0; j < tpl_cols; j ++) + { + delta = img_ptr[j] - tpl_ptr[j]; + sum = mad24(delta, delta, sum); + } + } + res[res_idx] = sum; + } +} + +__kernel +void matchTemplate_Naive_SQDIFF_C1_D5 +( + __global const float * img, + __global const float * tpl, + __global float * res, + int img_rows, + int img_cols, + int tpl_rows, + int tpl_cols, + int res_rows, + int res_cols, + int img_offset, + int tpl_offset, + int res_offset, + int img_step, + int tpl_step, + int res_step +) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int i,j; + float delta; + float sum = 0; + img_step /= sizeof(*img); + img_offset /= sizeof(*img); + tpl_step /= sizeof(*tpl); + tpl_offset /= sizeof(*tpl); + res_step /= sizeof(*res); + res_offset /= sizeof(*res); + + int res_idx = mad24(gidy, res_step, res_offset + gidx); + + if(gidx < res_cols && gidy < res_rows) + { + for(i = 0; i < tpl_rows; i ++) + { + // get specific rows of img data + __global const float * img_ptr = img + mad24(gidy + i, img_step, gidx + img_offset); + __global const float * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset); + for(j = 0; j < tpl_cols; j ++) + { + delta = img_ptr[j] - tpl_ptr[j]; + sum = mad(delta, delta, sum); + } + } + res[res_idx] = sum; + } +} + +__kernel +void matchTemplate_Naive_SQDIFF_C4_D0 +( + __global const uchar4 * img, + __global const uchar4 * tpl, + __global float * res, + int img_rows, + int img_cols, + int tpl_rows, + int tpl_cols, + int res_rows, + int res_cols, + int img_offset, + int tpl_offset, + int res_offset, + int img_step, + int tpl_step, + int res_step +) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int i,j; + int4 delta; + int4 sum = (int4)(0, 0, 0, 0); + img_step /= sizeof(*img); + img_offset /= sizeof(*img); + tpl_step /= sizeof(*tpl); + tpl_offset /= sizeof(*tpl); + res_step /= sizeof(*res); + res_offset /= sizeof(*res); + + int res_idx = mad24(gidy, res_step, res_offset + gidx); + + if(gidx < res_cols && gidy < res_rows) + { + for(i = 0; i < tpl_rows; i ++) + { + // get specific rows of img data + __global const uchar4 * img_ptr = img + mad24(gidy + i, img_step, gidx + img_offset); + __global const uchar4 * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset); + for(j = 0; j < tpl_cols; j ++) + { + //delta = convert_int4(img_ptr[j] - tpl_ptr[j]); // this alternative is incorrect + delta.x = img_ptr[j].x - tpl_ptr[j].x; + delta.y = img_ptr[j].y - tpl_ptr[j].y; + delta.z = img_ptr[j].z - tpl_ptr[j].z; + delta.w = img_ptr[j].w - tpl_ptr[j].w; + sum = mad24(delta, delta, sum); + } + } + res[res_idx] = sum.x + sum.y + sum.z + sum.w; + } +} + +__kernel +void matchTemplate_Naive_SQDIFF_C4_D5 +( + __global const float4 * img, + __global const float4 * tpl, + __global float * res, + int img_rows, + int img_cols, + int tpl_rows, + int tpl_cols, + int res_rows, + int res_cols, + int img_offset, + int tpl_offset, + int res_offset, + int img_step, + int tpl_step, + int res_step +) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int i,j; + float4 delta; + float4 sum = (float4)(0, 0, 0, 0); + img_step /= sizeof(*img); + img_offset /= sizeof(*img); + tpl_step /= sizeof(*tpl); + tpl_offset /= sizeof(*tpl); + res_step /= sizeof(*res); + res_offset /= sizeof(*res); + + int res_idx = mad24(gidy, res_step, res_offset + gidx); + + if(gidx < res_cols && gidy < res_rows) + { + for(i = 0; i < tpl_rows; i ++) + { + // get specific rows of img data + __global const float4 * img_ptr = img + mad24(gidy + i, img_step, gidx + img_offset); + __global const float4 * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset); + for(j = 0; j < tpl_cols; j ++) + { + //delta = convert_int4(img_ptr[j] - tpl_ptr[j]); // this alternative is incorrect + delta.x = img_ptr[j].x - tpl_ptr[j].x; + delta.y = img_ptr[j].y - tpl_ptr[j].y; + delta.z = img_ptr[j].z - tpl_ptr[j].z; + delta.w = img_ptr[j].w - tpl_ptr[j].w; + sum = mad(delta, delta, sum); + } + } + res[res_idx] = sum.x + sum.y + sum.z + sum.w; + } +} + +////////////////////////////////////////////////// +// CCORR +__kernel +void matchTemplate_Naive_CCORR_C1_D0 +( + __global const uchar * img, + __global const uchar * tpl, + __global float * res, + int img_rows, + int img_cols, + int tpl_rows, + int tpl_cols, + int res_rows, + int res_cols, + int img_offset, + int tpl_offset, + int res_offset, + int img_step, + int tpl_step, + int res_step +) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int i,j; + int sum = 0; + res_step /= sizeof(*res); + res_offset /= sizeof(*res); + + int res_idx = mad24(gidy, res_step, res_offset + gidx); + + if(gidx < res_cols && gidy < res_rows) + { + for(i = 0; i < tpl_rows; i ++) + { + // get specific rows of img data + __global const uchar * img_ptr = img + mad24(gidy + i, img_step, gidx + img_offset); + __global const uchar * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset); + for(j = 0; j < tpl_cols; j ++) + { + sum = mad24(img_ptr[j], tpl_ptr[j], sum); + } + } + res[res_idx] = sum; + } +} + +__kernel +void matchTemplate_Naive_CCORR_C1_D5 +( + __global const float * img, + __global const float * tpl, + __global float * res, + int img_rows, + int img_cols, + int tpl_rows, + int tpl_cols, + int res_rows, + int res_cols, + int img_offset, + int tpl_offset, + int res_offset, + int img_step, + int tpl_step, + int res_step +) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int i,j; + float sum = 0; + img_step /= sizeof(*img); + img_offset /= sizeof(*img); + tpl_step /= sizeof(*tpl); + tpl_offset /= sizeof(*tpl); + res_step /= sizeof(*res); + res_offset /= sizeof(*res); + + int res_idx = mad24(gidy, res_step, res_offset + gidx); + + if(gidx < res_cols && gidy < res_rows) + { + for(i = 0; i < tpl_rows; i ++) + { + // get specific rows of img data + __global const float * img_ptr = img + mad24(gidy + i, img_step, gidx + img_offset); + __global const float * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset); + for(j = 0; j < tpl_cols; j ++) + { + sum = mad(img_ptr[j], tpl_ptr[j], sum); + } + } + res[res_idx] = sum; + } +} + +__kernel +void matchTemplate_Naive_CCORR_C4_D0 +( + __global const uchar4 * img, + __global const uchar4 * tpl, + __global float * res, + int img_rows, + int img_cols, + int tpl_rows, + int tpl_cols, + int res_rows, + int res_cols, + int img_offset, + int tpl_offset, + int res_offset, + int img_step, + int tpl_step, + int res_step +) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int i,j; + int4 sum = (int4)(0, 0, 0, 0); + img_step /= sizeof(*img); + img_offset /= sizeof(*img); + tpl_step /= sizeof(*tpl); + tpl_offset /= sizeof(*tpl); + res_step /= sizeof(*res); + res_offset /= sizeof(*res); + + int res_idx = mad24(gidy, res_step, res_offset + gidx); + + if(gidx < res_cols && gidy < res_rows) + { + for(i = 0; i < tpl_rows; i ++) + { + // get specific rows of img data + __global const uchar4 * img_ptr = img + mad24(gidy + i, img_step, gidx + img_offset); + __global const uchar4 * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset); + for(j = 0; j < tpl_cols; j ++) + { + sum = mad24(convert_int4(img_ptr[j]), convert_int4(tpl_ptr[j]), sum); + } + } + res[res_idx] = sum.x + sum.y + sum.z + sum.w; + } +} + +__kernel +void matchTemplate_Naive_CCORR_C4_D5 +( + __global const float4 * img, + __global const float4 * tpl, + __global float * res, + int img_rows, + int img_cols, + int tpl_rows, + int tpl_cols, + int res_rows, + int res_cols, + int img_offset, + int tpl_offset, + int res_offset, + int img_step, + int tpl_step, + int res_step +) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int i,j; + float4 sum = (float4)(0, 0, 0, 0); + img_step /= sizeof(*img); + img_offset /= sizeof(*img); + tpl_step /= sizeof(*tpl); + tpl_offset /= sizeof(*tpl); + res_step /= sizeof(*res); + res_offset /= sizeof(*res); + + int res_idx = mad24(gidy, res_step, res_offset + gidx); + + if(gidx < res_cols && gidy < res_rows) + { + for(i = 0; i < tpl_rows; i ++) + { + // get specific rows of img data + __global const float4 * img_ptr = img + mad24(gidy + i, img_step, gidx + img_offset); + __global const float4 * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset); + for(j = 0; j < tpl_cols; j ++) + { + sum = mad(convert_float4(img_ptr[j]), convert_float4(tpl_ptr[j]), sum); + } + } + res[res_idx] = sum.x + sum.y + sum.z + sum.w; + } +} + +////////////////////////////////////////////////// +// CCOFF +__kernel +void matchTemplate_Prepared_CCOFF_C1_D0 +( + __global float * res, + int img_rows, + int img_cols, + int tpl_rows, + int tpl_cols, + int res_rows, + int res_cols, + int res_offset, + int res_step, + __global const uint * img_sums, + int img_sums_offset, + int img_sums_step, + float tpl_sum +) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + + img_sums_offset /= sizeof(*img_sums); + img_sums_step /= sizeof(*img_sums); + res_step /= sizeof(*res); + res_offset /= sizeof(*res); + + int res_idx = mad24(gidy, res_step, res_offset + gidx); + + if(gidx < res_cols && gidy < res_rows) + { + float sum = (float)( + (img_sums[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums[SUMS_PTR(tpl_cols, 0)]) + - (img_sums[SUMS_PTR(0, tpl_rows)] - img_sums[SUMS_PTR(0, 0)])); + res[res_idx] -= sum * tpl_sum; + } +} +__kernel +void matchTemplate_Prepared_CCOFF_C4_D0 +( + __global float * res, + int img_rows, + int img_cols, + int tpl_rows, + int tpl_cols, + int res_rows, + int res_cols, + int res_offset, + int res_step, + __global const uint * img_sums_c0, + __global const uint * img_sums_c1, + __global const uint * img_sums_c2, + __global const uint * img_sums_c3, + int img_sums_offset, + int img_sums_step, + float tpl_sum_c0, + float tpl_sum_c1, + float tpl_sum_c2, + float tpl_sum_c3 +) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + + img_sums_offset /= sizeof(*img_sums_c0); + img_sums_step /= sizeof(*img_sums_c0); + res_step /= sizeof(*res); + res_offset /= sizeof(*res); + + int res_idx = mad24(gidy, res_step, res_offset + gidx); + + if(gidx < res_cols && gidy < res_rows) + { + float ccorr = res[res_idx]; + ccorr -= tpl_sum_c0*(float)( + (img_sums_c0[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c0[SUMS_PTR(tpl_cols, 0)]) + - (img_sums_c0[SUMS_PTR(0, tpl_rows)] - img_sums_c0[SUMS_PTR(0, 0)])); + ccorr -= tpl_sum_c1*(float)( + (img_sums_c1[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c1[SUMS_PTR(tpl_cols, 0)]) + - (img_sums_c1[SUMS_PTR(0, tpl_rows)] - img_sums_c1[SUMS_PTR(0, 0)])); + ccorr -= tpl_sum_c2*(float)( + (img_sums_c2[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c2[SUMS_PTR(tpl_cols, 0)]) + - (img_sums_c2[SUMS_PTR(0, tpl_rows)] - img_sums_c2[SUMS_PTR(0, 0)])); + ccorr -= tpl_sum_c3*(float)( + (img_sums_c3[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c3[SUMS_PTR(tpl_cols, 0)]) + - (img_sums_c3[SUMS_PTR(0, tpl_rows)] - img_sums_c3[SUMS_PTR(0, 0)])); + res[res_idx] = ccorr; + } +} + +__kernel +void matchTemplate_Prepared_CCOFF_NORMED_C1_D0 +( + __global float * res, + int img_rows, + int img_cols, + int tpl_rows, + int tpl_cols, + int res_rows, + int res_cols, + int res_offset, + int res_step, + float weight, + __global const uint * img_sums, + int img_sums_offset, + int img_sums_step, + __global const TYPE_IMAGE_SQSUM * img_sqsums, + int img_sqsums_offset, + int img_sqsums_step, + float tpl_sum, + float tpl_sqsum +) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + + img_sqsums_step /= sizeof(*img_sqsums); + img_sqsums_offset /= sizeof(*img_sqsums); + img_sums_offset /= sizeof(*img_sums); + img_sums_step /= sizeof(*img_sums); + res_step /= sizeof(*res); + res_offset /= sizeof(*res); + + + int res_idx = mad24(gidy, res_step, res_offset + gidx); + + if(gidx < res_cols && gidy < res_rows) + { + float image_sum_ = (float)( + (img_sums[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums[SUMS_PTR(tpl_cols, 0)]) + - (img_sums[SUMS_PTR(0, tpl_rows)] - img_sums[SUMS_PTR(0, 0)])); + + float image_sqsum_ = (float)( + (img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) - + (img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)])); + res[res_idx] = normAcc(res[res_idx] - image_sum_ * tpl_sum, + sqrt(tpl_sqsum * (image_sqsum_ - weight * image_sum_ * image_sum_))); + } +} +__kernel +void matchTemplate_Prepared_CCOFF_NORMED_C4_D0 +( + __global float * res, + int img_rows, + int img_cols, + int tpl_rows, + int tpl_cols, + int res_rows, + int res_cols, + int res_offset, + int res_step, + float weight, + __global const uint * img_sums_c0, + __global const uint * img_sums_c1, + __global const uint * img_sums_c2, + __global const uint * img_sums_c3, + int img_sums_offset, + int img_sums_step, + __global const TYPE_IMAGE_SQSUM * img_sqsums_c0, + __global const TYPE_IMAGE_SQSUM * img_sqsums_c1, + __global const TYPE_IMAGE_SQSUM * img_sqsums_c2, + __global const TYPE_IMAGE_SQSUM * img_sqsums_c3, + int img_sqsums_offset, + int img_sqsums_step, + float tpl_sum_c0, + float tpl_sum_c1, + float tpl_sum_c2, + float tpl_sum_c3, + float tpl_sqsum +) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + + img_sqsums_step /= sizeof(*img_sqsums_c0); + img_sqsums_offset /= sizeof(*img_sqsums_c0); + img_sums_offset /= sizeof(*img_sums_c0); + img_sums_step /= sizeof(*img_sums_c0); + res_step /= sizeof(*res); + res_offset /= sizeof(*res); + + int res_idx = mad24(gidy, res_step, res_offset + gidx); + + if(gidx < res_cols && gidy < res_rows) + { + float image_sum_c0 = (float)( + (img_sums_c0[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c0[SUMS_PTR(tpl_cols, 0)]) + - (img_sums_c0[SUMS_PTR(0, tpl_rows)] - img_sums_c0[SUMS_PTR(0, 0)])); + float image_sum_c1 = (float)( + (img_sums_c1[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c1[SUMS_PTR(tpl_cols, 0)]) + - (img_sums_c1[SUMS_PTR(0, tpl_rows)] - img_sums_c1[SUMS_PTR(0, 0)])); + float image_sum_c2 = (float)( + (img_sums_c2[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c2[SUMS_PTR(tpl_cols, 0)]) + - (img_sums_c2[SUMS_PTR(0, tpl_rows)] - img_sums_c2[SUMS_PTR(0, 0)])); + float image_sum_c3 = (float)( + (img_sums_c3[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c3[SUMS_PTR(tpl_cols, 0)]) + - (img_sums_c3[SUMS_PTR(0, tpl_rows)] - img_sums_c3[SUMS_PTR(0, 0)])); + + float image_sqsum_c0 = (float)( + (img_sqsums_c0[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c0[SQSUMS_PTR(tpl_cols, 0)]) - + (img_sqsums_c0[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c0[SQSUMS_PTR(0, 0)])); + float image_sqsum_c1 = (float)( + (img_sqsums_c1[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c1[SQSUMS_PTR(tpl_cols, 0)]) - + (img_sqsums_c1[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c1[SQSUMS_PTR(0, 0)])); + float image_sqsum_c2 = (float)( + (img_sqsums_c2[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c2[SQSUMS_PTR(tpl_cols, 0)]) - + (img_sqsums_c2[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c2[SQSUMS_PTR(0, 0)])); + float image_sqsum_c3 = (float)( + (img_sqsums_c3[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c3[SQSUMS_PTR(tpl_cols, 0)]) - + (img_sqsums_c3[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c3[SQSUMS_PTR(0, 0)])); + + float num = res[res_idx] - + image_sum_c0 * tpl_sum_c0 - + image_sum_c1 * tpl_sum_c1 - + image_sum_c2 * tpl_sum_c2 - + image_sum_c3 * tpl_sum_c3; + float denum = sqrt( tpl_sqsum * ( + image_sqsum_c0 - weight * image_sum_c0 * image_sum_c0 + + image_sqsum_c1 - weight * image_sum_c1 * image_sum_c1 + + image_sqsum_c2 - weight * image_sum_c2 * image_sum_c2 + + image_sqsum_c3 - weight * image_sum_c0 * image_sum_c3) + ); + res[res_idx] = normAcc(num, denum); + } +} + diff --git a/modules/ocl/src/kernels/pyr_down.cl b/modules/ocl/src/kernels/pyr_down.cl new file mode 100644 index 0000000000..38b4ec7c7f --- /dev/null +++ b/modules/ocl/src/kernels/pyr_down.cl @@ -0,0 +1,500 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// @Authors +// Dachuan Zhao, dachuan@multicorewareinc.com +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other oclMaterials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors as is and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#pragma OPENCL EXTENSION cl_amd_printf : enable + + +uchar round_uchar_uchar(uchar v) +{ + return v; +} + +uchar round_uchar_int(int v) +{ + return (uchar)((uint)v <= 255 ? v : v > 0 ? 255 : 0); +} + +uchar round_uchar_float(float v) +{ + if(v - convert_int_sat_rte(v) > 1e-6 || v - convert_int_sat_rte(v) < -1e-6) + { + if(((int)v + 1) - (v + 0.5f) < 1e-6 && ((int)v + 1) - (v + 0.5f) > -1e-6) + { + v = (int)v + 0.51f; + } + } + int iv = convert_int_sat_rte(v); + return round_uchar_int(iv); +} + +uchar4 round_uchar4_uchar4(uchar4 v) +{ + return v; +} + +uchar4 round_uchar4_int4(int4 v) +{ + uchar4 result; + result.x = (uchar)(v.x <= 255 ? v.x : v.x > 0 ? 255 : 0); + result.y = (uchar)(v.y <= 255 ? v.y : v.y > 0 ? 255 : 0); + result.z = (uchar)(v.z <= 255 ? v.z : v.z > 0 ? 255 : 0); + result.w = (uchar)(v.w <= 255 ? v.w : v.w > 0 ? 255 : 0); + return result; +} + +uchar4 round_uchar4_float4(float4 v) +{ + if(v.x - convert_int_sat_rte(v.x) > 1e-6 || v.x - convert_int_sat_rte(v.x) < -1e-6) + { + if(((int)(v.x) + 1) - (v.x + 0.5f) < 1e-6 && ((int)(v.x) + 1) - (v.x + 0.5f) > -1e-6) + { + v.x = (int)(v.x) + 0.51f; + } + } + if(v.y - convert_int_sat_rte(v.y) > 1e-6 || v.y - convert_int_sat_rte(v.y) < -1e-6) + { + if(((int)(v.y) + 1) - (v.y + 0.5f) < 1e-6 && ((int)(v.y) + 1) - (v.y + 0.5f) > -1e-6) + { + v.y = (int)(v.y) + 0.51f; + } + } + if(v.z - convert_int_sat_rte(v.z) > 1e-6 || v.z - convert_int_sat_rte(v.z) < -1e-6) + { + if(((int)(v.z) + 1) - (v.z + 0.5f) < 1e-6 && ((int)(v.z) + 1) - (v.z + 0.5f) > -1e-6) + { + v.z = (int)(v.z) + 0.51f; + } + } + if(v.w - convert_int_sat_rte(v.w) > 1e-6 || v.w - convert_int_sat_rte(v.w) < -1e-6) + { + if(((int)(v.w) + 1) - (v.w + 0.5f) < 1e-6 && ((int)(v.w) + 1) - (v.w + 0.5f) > -1e-6) + { + v.w = (int)(v.w) + 0.51f; + } + } + int4 iv = convert_int4_sat_rte(v); + return round_uchar4_int4(iv); +} + + + + +int idx_row_low(int y, int last_row) +{ + if(y < 0) + { + y = -y; + } + return y % (last_row + 1); +} + +int idx_row_high(int y, int last_row) +{ + int i; + int j; + if(last_row - y < 0) + { + i = (y - last_row); + } + else + { + i = (last_row - y); + } + if(last_row - i < 0) + { + j = i - last_row; + } + else + { + j = last_row - i; + } + return j % (last_row + 1); +} + +int idx_row(int y, int last_row) +{ + return idx_row_low(idx_row_high(y, last_row), last_row); +} + +int idx_col_low(int x, int last_col) +{ + if(x < 0) + { + x = -x; + } + return x % (last_col + 1); +} + +int idx_col_high(int x, int last_col) +{ + int i; + int j; + if(last_col - x < 0) + { + i = (x - last_col); + } + else + { + i = (last_col - x); + } + if(last_col - i < 0) + { + j = i - last_col; + } + else + { + j = last_col - i; + } + return j % (last_col + 1); +} + +int idx_col(int x, int last_col) +{ + return idx_col_low(idx_col_high(x, last_col), last_col); +} + +__kernel void pyrDown_C1_D0(__global uchar * srcData, int srcStep, int srcOffset, int srcRows, int srcCols, __global uchar *dst, int dstStep, int dstOffset, int dstCols) +{ + const int x = get_group_id(0) * get_local_size(0) + get_local_id(0); + const int y = get_group_id(1); + + __local float smem[256 + 4]; + + float sum; + + const int src_y = 2*y; + const int last_row = srcRows - 1; + const int last_col = srcCols - 1; + + sum = 0; + + sum = sum + 0.0625f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[idx_col(x, last_col)]); + sum = sum + 0.25f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[idx_col(x, last_col)]); + sum = sum + 0.375f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[idx_col(x, last_col)]); + sum = sum + 0.25f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[idx_col(x, last_col)]); + sum = sum + 0.0625f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[idx_col(x, last_col)]); + + smem[2 + get_local_id(0)] = sum; + + if (get_local_id(0) < 2) + { + const int left_x = x - 2; + + sum = 0; + + sum = sum + 0.0625f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[idx_col(left_x, last_col)]); + sum = sum + 0.25f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[idx_col(left_x, last_col)]); + sum = sum + 0.375f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[idx_col(left_x, last_col)]); + sum = sum + 0.25f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[idx_col(left_x, last_col)]); + sum = sum + 0.0625f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[idx_col(left_x, last_col)]); + + smem[get_local_id(0)] = sum; + } + + if (get_local_id(0) > 253) + { + const int right_x = x + 2; + + sum = 0; + + sum = sum + 0.0625f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[idx_col(right_x, last_col)]); + sum = sum + 0.25f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[idx_col(right_x, last_col)]); + sum = sum + 0.375f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[idx_col(right_x, last_col)]); + sum = sum + 0.25f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[idx_col(right_x, last_col)]); + sum = sum + 0.0625f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[idx_col(right_x, last_col)]); + + smem[4 + get_local_id(0)] = sum; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (get_local_id(0) < 128) + { + const int tid2 = get_local_id(0) * 2; + + sum = 0; + + sum = sum + 0.0625f * smem[2 + tid2 - 2]; + sum = sum + 0.25f * smem[2 + tid2 - 1]; + sum = sum + 0.375f * smem[2 + tid2 ]; + sum = sum + 0.25f * smem[2 + tid2 + 1]; + sum = sum + 0.0625f * smem[2 + tid2 + 2]; + + const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2; + + if (dst_x < dstCols) + dst[y * dstStep + dst_x] = round_uchar_float(sum); + } +} + +__kernel void pyrDown_C4_D0(__global uchar4 * srcData, int srcStep, int srcOffset, int srcRows, int srcCols, __global uchar4 *dst, int dstStep, int dstOffset, int dstCols) +{ + const int x = get_group_id(0) * get_local_size(0) + get_local_id(0); + const int y = get_group_id(1); + + __local float4 smem[256 + 4]; + + float4 sum; + + const int src_y = 2*y; + const int last_row = srcRows - 1; + const int last_col = srcCols - 1; + + float4 co1 = (float4)(0.375f, 0.375f, 0.375f, 0.375f); + float4 co2 = (float4)(0.25f, 0.25f, 0.25f, 0.25f); + float4 co3 = (float4)(0.0625f, 0.0625f, 0.0625f, 0.0625f); + + sum = 0; + + sum = sum + co3 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[idx_col(x, last_col)])); + sum = sum + co2 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[idx_col(x, last_col)])); + sum = sum + co1 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[idx_col(x, last_col)])); + sum = sum + co2 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[idx_col(x, last_col)])); + sum = sum + co3 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[idx_col(x, last_col)])); + + smem[2 + get_local_id(0)] = sum; + + if (get_local_id(0) < 2) + { + const int left_x = x - 2; + + sum = 0; + + sum = sum + co3 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[idx_col(left_x, last_col)])); + sum = sum + co2 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[idx_col(left_x, last_col)])); + sum = sum + co1 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[idx_col(left_x, last_col)])); + sum = sum + co2 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[idx_col(left_x, last_col)])); + sum = sum + co3 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[idx_col(left_x, last_col)])); + + smem[get_local_id(0)] = sum; + } + + if (get_local_id(0) > 253) + { + const int right_x = x + 2; + + sum = 0; + + sum = sum + co3 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[idx_col(right_x, last_col)])); + sum = sum + co2 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[idx_col(right_x, last_col)])); + sum = sum + co1 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[idx_col(right_x, last_col)])); + sum = sum + co2 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[idx_col(right_x, last_col)])); + sum = sum + co3 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[idx_col(right_x, last_col)])); + + smem[4 + get_local_id(0)] = sum; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (get_local_id(0) < 128) + { + const int tid2 = get_local_id(0) * 2; + + sum = 0; + + sum = sum + co3 * smem[2 + tid2 - 2]; + sum = sum + co2 * smem[2 + tid2 - 1]; + sum = sum + co1 * smem[2 + tid2 ]; + sum = sum + co2 * smem[2 + tid2 + 1]; + sum = sum + co3 * smem[2 + tid2 + 2]; + + const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2; + + if (dst_x < dstCols) + dst[y * dstStep / 4 + dst_x] = round_uchar4_float4(sum); + } +} + +__kernel void pyrDown_C1_D5(__global float * srcData, int srcStep, int srcOffset, int srcRows, int srcCols, __global float *dst, int dstStep, int dstOffset, int dstCols) +{ + const int x = get_group_id(0) * get_local_size(0) + get_local_id(0); + const int y = get_group_id(1); + + __local float smem[256 + 4]; + + float sum; + + const int src_y = 2*y; + const int last_row = srcRows - 1; + const int last_col = srcCols - 1; + + sum = 0; + + sum = sum + 0.0625f * ((__global float*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[idx_col(x, last_col)]; + sum = sum + 0.25f * ((__global float*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[idx_col(x, last_col)]; + sum = sum + 0.375f * ((__global float*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[idx_col(x, last_col)]; + sum = sum + 0.25f * ((__global float*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[idx_col(x, last_col)]; + sum = sum + 0.0625f * ((__global float*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[idx_col(x, last_col)]; + + smem[2 + get_local_id(0)] = sum; + + if (get_local_id(0) < 2) + { + const int left_x = x - 2; + + sum = 0; + + sum = sum + 0.0625f * ((__global float*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[idx_col(left_x, last_col)]; + sum = sum + 0.25f * ((__global float*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[idx_col(left_x, last_col)]; + sum = sum + 0.375f * ((__global float*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[idx_col(left_x, last_col)]; + sum = sum + 0.25f * ((__global float*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[idx_col(left_x, last_col)]; + sum = sum + 0.0625f * ((__global float*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[idx_col(left_x, last_col)]; + + smem[get_local_id(0)] = sum; + } + + if (get_local_id(0) > 253) + { + const int right_x = x + 2; + + sum = 0; + + sum = sum + 0.0625f * ((__global float*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[idx_col(right_x, last_col)]; + sum = sum + 0.25f * ((__global float*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[idx_col(right_x, last_col)]; + sum = sum + 0.375f * ((__global float*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[idx_col(right_x, last_col)]; + sum = sum + 0.25f * ((__global float*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[idx_col(right_x, last_col)]; + sum = sum + 0.0625f * ((__global float*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[idx_col(right_x, last_col)]; + + smem[4 + get_local_id(0)] = sum; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (get_local_id(0) < 128) + { + const int tid2 = get_local_id(0) * 2; + + sum = 0; + + sum = sum + 0.0625f * smem[2 + tid2 - 2]; + sum = sum + 0.25f * smem[2 + tid2 - 1]; + sum = sum + 0.375f * smem[2 + tid2 ]; + sum = sum + 0.25f * smem[2 + tid2 + 1]; + sum = sum + 0.0625f * smem[2 + tid2 + 2]; + + const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2; + + if (dst_x < dstCols) + dst[y * dstStep / 4 + dst_x] = sum; + } +} + +__kernel void pyrDown_C4_D5(__global float4 * srcData, int srcStep, int srcOffset, int srcRows, int srcCols, __global float4 *dst, int dstStep, int dstOffset, int dstCols) +{ + const int x = get_group_id(0) * get_local_size(0) + get_local_id(0); + const int y = get_group_id(1); + + __local float4 smem[256 + 4]; + + float4 sum; + + const int src_y = 2*y; + const int last_row = srcRows - 1; + const int last_col = srcCols - 1; + + float4 co1 = (float4)(0.375f, 0.375f, 0.375f, 0.375f); + float4 co2 = (float4)(0.25f, 0.25f, 0.25f, 0.25f); + float4 co3 = (float4)(0.0625f, 0.0625f, 0.0625f, 0.0625f); + + sum = 0; + + sum = sum + co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[idx_col(x, last_col)]; + sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[idx_col(x, last_col)]; + sum = sum + co1 * ((__global float4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[idx_col(x, last_col)]; + sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[idx_col(x, last_col)]; + sum = sum + co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[idx_col(x, last_col)]; + + smem[2 + get_local_id(0)] = sum; + + if (get_local_id(0) < 2) + { + const int left_x = x - 2; + + sum = 0; + + sum = sum + co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[idx_col(left_x, last_col)]; + sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[idx_col(left_x, last_col)]; + sum = sum + co1 * ((__global float4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[idx_col(left_x, last_col)]; + sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[idx_col(left_x, last_col)]; + sum = sum + co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[idx_col(left_x, last_col)]; + + smem[get_local_id(0)] = sum; + } + + if (get_local_id(0) > 253) + { + const int right_x = x + 2; + + sum = 0; + + sum = sum + co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[idx_col(right_x, last_col)]; + sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[idx_col(right_x, last_col)]; + sum = sum + co1 * ((__global float4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[idx_col(right_x, last_col)]; + sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[idx_col(right_x, last_col)]; + sum = sum + co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[idx_col(right_x, last_col)]; + + smem[4 + get_local_id(0)] = sum; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (get_local_id(0) < 128) + { + const int tid2 = get_local_id(0) * 2; + + sum = 0; + + sum = sum + co3 * smem[2 + tid2 - 2]; + sum = sum + co2 * smem[2 + tid2 - 1]; + sum = sum + co1 * smem[2 + tid2 ]; + sum = sum + co2 * smem[2 + tid2 + 1]; + sum = sum + co3 * smem[2 + tid2 + 2]; + + const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2; + + if (dst_x < dstCols) + dst[y * dstStep / 16 + dst_x] = sum; + } +} diff --git a/modules/ocl/src/kernels/pyr_up.cl b/modules/ocl/src/kernels/pyr_up.cl new file mode 100644 index 0000000000..dd3ba43d1b --- /dev/null +++ b/modules/ocl/src/kernels/pyr_up.cl @@ -0,0 +1,750 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// @Authors +// Zhang Chunpeng chunpeng@multicorewareinc.com +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other oclMaterials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors as is and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +//#pragma OPENCL EXTENSION cl_amd_printf : enable + +uchar get_valid_uchar(uchar data) +{ + return (uchar)(data <= 255 ? data : data > 0 ? 255 : 0); +} +/////////////////////////////////////////////////////////////////////// +////////////////////////// CV_8UC1 ////////////////////////////////// +/////////////////////////////////////////////////////////////////////// +__kernel void pyrUp_C1_D0(__global uchar* src,__global uchar* dst, + int srcRows,int dstRows,int srcCols,int dstCols, + int srcOffset,int dstOffset,int srcStep,int dstStep) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + + __local float s_srcPatch[10][10]; + __local float s_dstPatch[20][16]; + + + if( get_local_id(0) < 10 && get_local_id(1) < 10 ) + { + int srcx = (int)(get_group_id(0) * get_local_size(0) / 2 + get_local_id(0)) - 1; + int srcy = (int)(get_group_id(1) * get_local_size(1) / 2 + get_local_id(1)) - 1; + + srcx = abs(srcx); + srcx = min(srcCols - 1,srcx); + + srcy = abs(srcy); + srcy = min(srcRows -1 ,srcy); + + s_srcPatch[get_local_id(1)][get_local_id(0)] = (float)(src[srcx + srcy * srcStep]); + + } + + barrier(CLK_LOCAL_MEM_FENCE); + + float sum = 0; + const int evenFlag = (int)((get_local_id(0) & 1) == 0); + const int oddFlag = (int)((get_local_id(0) & 1) != 0); + const bool eveny = ((get_local_id(1) & 1) == 0); + const int tidx = get_local_id(0); + + if(eveny) + { + sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 2) >> 1)]; + sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 1) >> 1)]; + sum = sum + (evenFlag * 0.375f ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx ) >> 1)]; + sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 1) >> 1)]; + sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 2) >> 1)]; + } + + s_dstPatch[2 + get_local_id(1)][get_local_id(0)] = sum; + + if (get_local_id(1) < 2) + { + sum = 0; + + if (eveny) + { + sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx - 2) >> 1)]; + sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[0][1 + ((tidx - 1) >> 1)]; + sum = sum + (evenFlag * 0.375f ) * s_srcPatch[0][1 + ((tidx ) >> 1)]; + sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[0][1 + ((tidx + 1) >> 1)]; + sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx + 2) >> 1)]; + } + + s_dstPatch[get_local_id(1)][get_local_id(0)] = sum; + } + + if (get_local_id(1) > 13) + { + sum = 0; + + if (eveny) + { + sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx - 2) >> 1)]; + sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[9][1 + ((tidx - 1) >> 1)]; + sum = sum + (evenFlag * 0.375f ) * s_srcPatch[9][1 + ((tidx ) >> 1)]; + sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[9][1 + ((tidx + 1) >> 1)]; + sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx + 2) >> 1)]; + } + s_dstPatch[4 + get_local_id(1)][get_local_id(0)] = sum; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + sum = 0; + + const int tidy = get_local_id(1); + + sum = sum + 0.0625f * s_dstPatch[2 + tidy - 2][get_local_id(0)]; + sum = sum + 0.25f * s_dstPatch[2 + tidy - 1][get_local_id(0)]; + sum = sum + 0.375f * s_dstPatch[2 + tidy ][get_local_id(0)]; + sum = sum + 0.25f * s_dstPatch[2 + tidy + 1][get_local_id(0)]; + sum = sum + 0.0625f * s_dstPatch[2 + tidy + 2][get_local_id(0)]; + + if ((x < dstCols) && (y < dstRows)) + dst[x + y * dstStep] = (float)(4.0f * sum); + +} + +/////////////////////////////////////////////////////////////////////// +////////////////////////// CV_16UC1 ///////////////////////////////// +/////////////////////////////////////////////////////////////////////// +__kernel void pyrUp_C1_D2(__global ushort* src,__global ushort* dst, + int srcRows,int dstRows,int srcCols,int dstCols, + int srcOffset,int dstOffset,int srcStep,int dstStep) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + + __local float s_srcPatch[10][10]; + __local float s_dstPatch[20][16]; + + srcStep = srcStep >> 1; + dstStep = dstStep >> 1; + srcOffset = srcOffset >> 1; + dstOffset = dstOffset >> 1; + + + if( get_local_id(0) < 10 && get_local_id(1) < 10 ) + { + int srcx = (int)(get_group_id(0) * get_local_size(0) / 2 + get_local_id(0)) - 1; + int srcy = (int)(get_group_id(1) * get_local_size(1) / 2 + get_local_id(1)) - 1; + + srcx = abs(srcx); + srcx = min(srcCols - 1,srcx); + + srcy = abs(srcy); + srcy = min(srcRows -1 ,srcy); + + s_srcPatch[get_local_id(1)][get_local_id(0)] = (float)(src[srcx + srcy * srcStep]); + + } + + barrier(CLK_LOCAL_MEM_FENCE); + + float sum = 0; + + const int evenFlag = (int)((get_local_id(0) & 1) == 0); + const int oddFlag = (int)((get_local_id(0) & 1) != 0); + const bool eveny = ((get_local_id(1) & 1) == 0); + const int tidx = get_local_id(0); + + if(eveny) + { + sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 2) >> 1)]; + sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 1) >> 1)]; + sum = sum + (evenFlag * 0.375f ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx ) >> 1)]; + sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 1) >> 1)]; + sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 2) >> 1)]; + } + + s_dstPatch[2 + get_local_id(1)][get_local_id(0)] = sum; + + if (get_local_id(1) < 2) + { + sum = 0; + + if (eveny) + { + sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx - 2) >> 1)]; + sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[0][1 + ((tidx - 1) >> 1)]; + sum = sum + (evenFlag * 0.375f ) * s_srcPatch[0][1 + ((tidx ) >> 1)]; + sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[0][1 + ((tidx + 1) >> 1)]; + sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx + 2) >> 1)]; + } + + s_dstPatch[get_local_id(1)][get_local_id(0)] = sum; + } + + if (get_local_id(1) > 13) + { + sum = 0; + + if (eveny) + { + sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx - 2) >> 1)]; + sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[9][1 + ((tidx - 1) >> 1)]; + sum = sum + (evenFlag * 0.375f ) * s_srcPatch[9][1 + ((tidx ) >> 1)]; + sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[9][1 + ((tidx + 1) >> 1)]; + sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx + 2) >> 1)]; + } + s_dstPatch[4 + get_local_id(1)][get_local_id(0)] = sum; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + sum = 0; + + const int tidy = get_local_id(1); + + sum = sum + 0.0625f * s_dstPatch[2 + tidy - 2][get_local_id(0)]; + sum = sum + 0.25f * s_dstPatch[2 + tidy - 1][get_local_id(0)]; + sum = sum + 0.375f * s_dstPatch[2 + tidy ][get_local_id(0)]; + sum = sum + 0.25f * s_dstPatch[2 + tidy + 1][get_local_id(0)]; + sum = sum + 0.0625f * s_dstPatch[2 + tidy + 2][get_local_id(0)]; + + if ((x < dstCols) && (y < dstRows)) + dst[x + y * dstStep] = (float)(4.0f * sum); + +} + +/////////////////////////////////////////////////////////////////////// +////////////////////////// CV_32FC1 ///////////////////////////////// +/////////////////////////////////////////////////////////////////////// +__kernel void pyrUp_C1_D5(__global float* src,__global float* dst, + int srcRows,int dstRows,int srcCols,int dstCols, + int srcOffset,int dstOffset,int srcStep,int dstStep) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + + __local float s_srcPatch[10][10]; + __local float s_dstPatch[20][16]; + + srcOffset = srcOffset >> 2; + dstOffset = dstOffset >> 2; + srcStep = srcStep >> 2; + dstStep = dstStep >> 2; + + + if( get_local_id(0) < 10 && get_local_id(1) < 10 ) + { + int srcx = (int)(get_group_id(0) * get_local_size(0) / 2 + get_local_id(0)) - 1; + int srcy = (int)(get_group_id(1) * get_local_size(1) / 2 + get_local_id(1)) - 1; + + srcx = abs(srcx); + srcx = min(srcCols - 1,srcx); + + srcy = abs(srcy); + srcy = min(srcRows -1 ,srcy); + + s_srcPatch[get_local_id(1)][get_local_id(0)] = (float)(src[srcx + srcy * srcStep]); + + } + + barrier(CLK_LOCAL_MEM_FENCE); + + float sum = 0; + const int evenFlag = (int)((get_local_id(0) & 1) == 0); + const int oddFlag = (int)((get_local_id(0) & 1) != 0); + const bool eveny = ((get_local_id(1) & 1) == 0); + const int tidx = get_local_id(0); + + if(eveny) + { + sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 2) >> 1)]; + sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 1) >> 1)]; + sum = sum + (evenFlag * 0.375f ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx ) >> 1)]; + sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 1) >> 1)]; + sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 2) >> 1)]; + } + + s_dstPatch[2 + get_local_id(1)][get_local_id(0)] = sum; + + if (get_local_id(1) < 2) + { + sum = 0; + + if (eveny) + { + sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx - 2) >> 1)]; + sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[0][1 + ((tidx - 1) >> 1)]; + sum = sum + (evenFlag * 0.375f ) * s_srcPatch[0][1 + ((tidx ) >> 1)]; + sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[0][1 + ((tidx + 1) >> 1)]; + sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx + 2) >> 1)]; + } + + s_dstPatch[get_local_id(1)][get_local_id(0)] = sum; + } + + if (get_local_id(1) > 13) + { + sum = 0; + + if (eveny) + { + sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx - 2) >> 1)]; + sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[9][1 + ((tidx - 1) >> 1)]; + sum = sum + (evenFlag * 0.375f ) * s_srcPatch[9][1 + ((tidx ) >> 1)]; + sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[9][1 + ((tidx + 1) >> 1)]; + sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx + 2) >> 1)]; + } + s_dstPatch[4 + get_local_id(1)][get_local_id(0)] = sum; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + sum = 0; + + const int tidy = get_local_id(1); + + sum = sum + 0.0625f * s_dstPatch[2 + tidy - 2][get_local_id(0)]; + sum = sum + 0.25f * s_dstPatch[2 + tidy - 1][get_local_id(0)]; + sum = sum + 0.375f * s_dstPatch[2 + tidy ][get_local_id(0)]; + sum = sum + 0.25f * s_dstPatch[2 + tidy + 1][get_local_id(0)]; + sum = sum + 0.0625f * s_dstPatch[2 + tidy + 2][get_local_id(0)]; + + if ((x < dstCols) && (y < dstRows)) + dst[x + y * dstStep] = (float)(4.0f * sum); + +} + +/////////////////////////////////////////////////////////////////////// +////////////////////////// CV_8UC4 ////////////////////////////////// +/////////////////////////////////////////////////////////////////////// +float4 covert_uchar4_to_float4(uchar4 data) +{ + float4 f4Data = {0,0,0,0}; + + f4Data.x = (float)data.x; + f4Data.y = (float)data.y; + f4Data.z = (float)data.z; + f4Data.w = (float)data.w; + + return f4Data; +} + + +uchar4 convert_float4_to_uchar4(float4 data) +{ + uchar4 u4Data; + + u4Data.x = get_valid_uchar(data.x); + u4Data.y = get_valid_uchar(data.y); + u4Data.z = get_valid_uchar(data.z); + u4Data.w = get_valid_uchar(data.w); + + return u4Data; +} + +float4 int_x_float4(int leftOpr,float4 rightOpr) +{ + float4 result = {0,0,0,0}; + + result.x = rightOpr.x * leftOpr; + result.y = rightOpr.y * leftOpr; + result.z = rightOpr.z * leftOpr; + result.w = rightOpr.w * leftOpr; + + return result; +} + +float4 float4_x_float4(float4 leftOpr,float4 rightOpr) +{ + float4 result; + + result.x = leftOpr.x * rightOpr.x; + result.y = leftOpr.y * rightOpr.y; + result.z = leftOpr.z * rightOpr.z; + result.w = leftOpr.w * rightOpr.w; + + return result; +} + +__kernel void pyrUp_C4_D0(__global uchar4* src,__global uchar4* dst, + int srcRows,int dstRows,int srcCols,int dstCols, + int srcOffset,int dstOffset,int srcStep,int dstStep) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + + __local float4 s_srcPatch[10][10]; + __local float4 s_dstPatch[20][16]; + + srcOffset >>= 2; + dstOffset >>= 2; + srcStep >>= 2; + dstStep >>= 2; + + + if( get_local_id(0) < 10 && get_local_id(1) < 10 ) + { + int srcx = (int)(get_group_id(0) * get_local_size(0) / 2 + get_local_id(0)) - 1; + int srcy = (int)(get_group_id(1) * get_local_size(1) / 2 + get_local_id(1)) - 1; + + srcx = abs(srcx); + srcx = min(srcCols - 1,srcx); + + srcy = abs(srcy); + srcy = min(srcRows -1 ,srcy); + + s_srcPatch[get_local_id(1)][get_local_id(0)] = covert_uchar4_to_float4(src[srcx + srcy * srcStep]); + } + + barrier(CLK_LOCAL_MEM_FENCE); + + float4 sum = (float4)(0,0,0,0); + + const int evenFlag = (int)((get_local_id(0) & 1) == 0); + const int oddFlag = (int)((get_local_id(0) & 1) != 0); + const bool eveny = ((get_local_id(1) & 1) == 0); + const int tidx = get_local_id(0); + + float4 co1 = (float4)(0.375f, 0.375f, 0.375f, 0.375f); + float4 co2 = (float4)(0.25f, 0.25f, 0.25f, 0.25f); + float4 co3 = (float4)(0.0625f, 0.0625f, 0.0625f, 0.0625f); + + + if(eveny) + { + sum = sum + float4_x_float4(int_x_float4( evenFlag, co3 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 2) >> 1)]); + sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 1) >> 1)]); + sum = sum + float4_x_float4(int_x_float4( evenFlag, co1 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx ) >> 1)]); + sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 1) >> 1)]); + sum = sum + float4_x_float4(int_x_float4( evenFlag, co3 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 2) >> 1)]); + + } + + s_dstPatch[2 + get_local_id(1)][get_local_id(0)] = sum; + + if (get_local_id(1) < 2) + { + sum = 0; + + if (eveny) + { + sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[0][1 + ((tidx - 2) >> 1)]); + sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[0][1 + ((tidx - 1) >> 1)]); + sum = sum + float4_x_float4(int_x_float4(evenFlag , co1 ) , s_srcPatch[0][1 + ((tidx ) >> 1)]); + sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[0][1 + ((tidx + 1) >> 1)]); + sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[0][1 + ((tidx + 2) >> 1)]); + } + + s_dstPatch[get_local_id(1)][get_local_id(0)] = sum; + } + + if (get_local_id(1) > 13) + { + sum = 0; + + if (eveny) + { + sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[9][1 + ((tidx - 2) >> 1)]); + sum = sum + float4_x_float4(int_x_float4( oddFlag , co2) , s_srcPatch[9][1 + ((tidx - 1) >> 1)]); + sum = sum + float4_x_float4(int_x_float4(evenFlag , co1) , s_srcPatch[9][1 + ((tidx ) >> 1)]); + sum = sum + float4_x_float4(int_x_float4( oddFlag , co2) , s_srcPatch[9][1 + ((tidx + 1) >> 1)]); + sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[9][1 + ((tidx + 2) >> 1)]); + + } + s_dstPatch[4 + get_local_id(1)][get_local_id(0)] = sum; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + sum = 0; + + const int tidy = get_local_id(1); + + sum = sum + float4_x_float4(co3 , s_dstPatch[2 + tidy - 2][get_local_id(0)]); + sum = sum + float4_x_float4(co2 , s_dstPatch[2 + tidy - 1][get_local_id(0)]); + sum = sum + float4_x_float4(co1 , s_dstPatch[2 + tidy ][get_local_id(0)]); + sum = sum + float4_x_float4(co2 , s_dstPatch[2 + tidy + 1][get_local_id(0)]); + sum = sum + float4_x_float4(co3 , s_dstPatch[2 + tidy + 2][get_local_id(0)]); + + if ((x < dstCols) && (y < dstRows)) + { + dst[x + y * dstStep] = convert_float4_to_uchar4(int_x_float4(4.0f,sum)); + } +} +/////////////////////////////////////////////////////////////////////// +////////////////////////// CV_16UC4 ////////////////////////////////// +/////////////////////////////////////////////////////////////////////// +float4 covert_ushort4_to_float4(ushort4 data) +{ + float4 f4Data = {0,0,0,0}; + + f4Data.x = (float)data.x; + f4Data.y = (float)data.y; + f4Data.z = (float)data.z; + f4Data.w = (float)data.w; + + return f4Data; +} + + +ushort4 convert_float4_to_ushort4(float4 data) +{ + ushort4 u4Data; + + u4Data.x = (float)data.x; + u4Data.y = (float)data.y; + u4Data.z = (float)data.z; + u4Data.w = (float)data.w; + + return u4Data; +} + + +__kernel void pyrUp_C4_D2(__global ushort4* src,__global ushort4* dst, + int srcRows,int dstRows,int srcCols,int dstCols, + int srcOffset,int dstOffset,int srcStep,int dstStep) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + + __local float4 s_srcPatch[10][10]; + __local float4 s_dstPatch[20][16]; + + srcOffset >>= 3; + dstOffset >>= 3; + srcStep >>= 3; + dstStep >>= 3; + + + if( get_local_id(0) < 10 && get_local_id(1) < 10 ) + { + int srcx = (int)(get_group_id(0) * get_local_size(0) / 2 + get_local_id(0)) - 1; + int srcy = (int)(get_group_id(1) * get_local_size(1) / 2 + get_local_id(1)) - 1; + + srcx = abs(srcx); + srcx = min(srcCols - 1,srcx); + + srcy = abs(srcy); + srcy = min(srcRows -1 ,srcy); + + s_srcPatch[get_local_id(1)][get_local_id(0)] = covert_ushort4_to_float4(src[srcx + srcy * srcStep]); + } + + barrier(CLK_LOCAL_MEM_FENCE); + + float4 sum = (float4)(0,0,0,0); + + const int evenFlag = (int)((get_local_id(0) & 1) == 0); + const int oddFlag = (int)((get_local_id(0) & 1) != 0); + const bool eveny = ((get_local_id(1) & 1) == 0); + const int tidx = get_local_id(0); + + float4 co1 = (float4)(0.375f, 0.375f, 0.375f, 0.375f); + float4 co2 = (float4)(0.25f, 0.25f, 0.25f, 0.25f); + float4 co3 = (float4)(0.0625f, 0.0625f, 0.0625f, 0.0625f); + + + if(eveny) + { + sum = sum + float4_x_float4(int_x_float4( evenFlag, co3 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 2) >> 1)]); + sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 1) >> 1)]); + sum = sum + float4_x_float4(int_x_float4( evenFlag, co1 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx ) >> 1)]); + sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 1) >> 1)]); + sum = sum + float4_x_float4(int_x_float4( evenFlag, co3 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 2) >> 1)]); + + } + + s_dstPatch[2 + get_local_id(1)][get_local_id(0)] = sum; + + if (get_local_id(1) < 2) + { + sum = 0; + + if (eveny) + { + sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[0][1 + ((tidx - 2) >> 1)]); + sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[0][1 + ((tidx - 1) >> 1)]); + sum = sum + float4_x_float4(int_x_float4(evenFlag , co1 ) , s_srcPatch[0][1 + ((tidx ) >> 1)]); + sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[0][1 + ((tidx + 1) >> 1)]); + sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[0][1 + ((tidx + 2) >> 1)]); + } + + s_dstPatch[get_local_id(1)][get_local_id(0)] = sum; + } + + if (get_local_id(1) > 13) + { + sum = 0; + + if (eveny) + { + sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[9][1 + ((tidx - 2) >> 1)]); + sum = sum + float4_x_float4(int_x_float4( oddFlag , co2) , s_srcPatch[9][1 + ((tidx - 1) >> 1)]); + sum = sum + float4_x_float4(int_x_float4(evenFlag , co1) , s_srcPatch[9][1 + ((tidx ) >> 1)]); + sum = sum + float4_x_float4(int_x_float4( oddFlag , co2) , s_srcPatch[9][1 + ((tidx + 1) >> 1)]); + sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[9][1 + ((tidx + 2) >> 1)]); + + } + s_dstPatch[4 + get_local_id(1)][get_local_id(0)] = sum; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + sum = 0; + + const int tidy = get_local_id(1); + + sum = sum + float4_x_float4(co3 , s_dstPatch[2 + tidy - 2][get_local_id(0)]); + sum = sum + float4_x_float4(co2 , s_dstPatch[2 + tidy - 1][get_local_id(0)]); + sum = sum + float4_x_float4(co1 , s_dstPatch[2 + tidy ][get_local_id(0)]); + sum = sum + float4_x_float4(co2 , s_dstPatch[2 + tidy + 1][get_local_id(0)]); + sum = sum + float4_x_float4(co3 , s_dstPatch[2 + tidy + 2][get_local_id(0)]); + + if ((x < dstCols) && (y < dstRows)) + { + dst[x + y * dstStep] = convert_float4_to_ushort4(int_x_float4(4.0f,sum)); + } +} + +/////////////////////////////////////////////////////////////////////// +////////////////////////// CV_32FC4 ////////////////////////////////// +/////////////////////////////////////////////////////////////////////// +__kernel void pyrUp_C4_D5(__global float4* src,__global float4* dst, + int srcRows,int dstRows,int srcCols,int dstCols, + int srcOffset,int dstOffset,int srcStep,int dstStep) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + + __local float4 s_srcPatch[10][10]; + __local float4 s_dstPatch[20][16]; + + srcOffset >>= 4; + dstOffset >>= 4; + srcStep >>= 4; + dstStep >>= 4; + + + if( get_local_id(0) < 10 && get_local_id(1) < 10 ) + { + int srcx = (int)(get_group_id(0) * get_local_size(0) / 2 + get_local_id(0)) - 1; + int srcy = (int)(get_group_id(1) * get_local_size(1) / 2 + get_local_id(1)) - 1; + + srcx = abs(srcx); + srcx = min(srcCols - 1,srcx); + + srcy = abs(srcy); + srcy = min(srcRows -1 ,srcy); + + s_srcPatch[get_local_id(1)][get_local_id(0)] = (float4)(src[srcx + srcy * srcStep]); + } + + barrier(CLK_LOCAL_MEM_FENCE); + + float4 sum = (float4)(0,0,0,0); + + const int evenFlag = (int)((get_local_id(0) & 1) == 0); + const int oddFlag = (int)((get_local_id(0) & 1) != 0); + const bool eveny = ((get_local_id(1) & 1) == 0); + const int tidx = get_local_id(0); + + float4 co1 = (float4)(0.375f, 0.375f, 0.375f, 0.375f); + float4 co2 = (float4)(0.25f, 0.25f, 0.25f, 0.25f); + float4 co3 = (float4)(0.0625f, 0.0625f, 0.0625f, 0.0625f); + + + if(eveny) + { + sum = sum + float4_x_float4(int_x_float4( evenFlag, co3 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 2) >> 1)]); + sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 1) >> 1)]); + sum = sum + float4_x_float4(int_x_float4( evenFlag, co1 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx ) >> 1)]); + sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 1) >> 1)]); + sum = sum + float4_x_float4(int_x_float4( evenFlag, co3 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 2) >> 1)]); + + } + + s_dstPatch[2 + get_local_id(1)][get_local_id(0)] = sum; + + if (get_local_id(1) < 2) + { + sum = 0; + + if (eveny) + { + sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[0][1 + ((tidx - 2) >> 1)]); + sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[0][1 + ((tidx - 1) >> 1)]); + sum = sum + float4_x_float4(int_x_float4(evenFlag , co1 ) , s_srcPatch[0][1 + ((tidx ) >> 1)]); + sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[0][1 + ((tidx + 1) >> 1)]); + sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[0][1 + ((tidx + 2) >> 1)]); + } + + s_dstPatch[get_local_id(1)][get_local_id(0)] = sum; + } + + if (get_local_id(1) > 13) + { + sum = 0; + + if (eveny) + { + sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[9][1 + ((tidx - 2) >> 1)]); + sum = sum + float4_x_float4(int_x_float4( oddFlag , co2) , s_srcPatch[9][1 + ((tidx - 1) >> 1)]); + sum = sum + float4_x_float4(int_x_float4(evenFlag , co1) , s_srcPatch[9][1 + ((tidx ) >> 1)]); + sum = sum + float4_x_float4(int_x_float4( oddFlag , co2) , s_srcPatch[9][1 + ((tidx + 1) >> 1)]); + sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[9][1 + ((tidx + 2) >> 1)]); + + } + s_dstPatch[4 + get_local_id(1)][get_local_id(0)] = sum; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + sum = 0; + + const int tidy = get_local_id(1); + + sum = sum + float4_x_float4(co3 , s_dstPatch[2 + tidy - 2][get_local_id(0)]); + sum = sum + float4_x_float4(co2 , s_dstPatch[2 + tidy - 1][get_local_id(0)]); + sum = sum + float4_x_float4(co1 , s_dstPatch[2 + tidy ][get_local_id(0)]); + sum = sum + float4_x_float4(co2 , s_dstPatch[2 + tidy + 1][get_local_id(0)]); + sum = sum + float4_x_float4(co3 , s_dstPatch[2 + tidy + 2][get_local_id(0)]); + + if ((x < dstCols) && (y < dstRows)) + { + dst[x + y * dstStep] = 4.0f * sum; + } +} \ No newline at end of file diff --git a/modules/ocl/src/match_template.cpp b/modules/ocl/src/match_template.cpp new file mode 100644 index 0000000000..ad31b00c68 --- /dev/null +++ b/modules/ocl/src/match_template.cpp @@ -0,0 +1,560 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// @Authors +// Peng Xiao, pengxiao@multicorewareinc.com +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other oclMaterials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors as is and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + + +#include +#include "precomp.hpp" + +using namespace cv; +using namespace cv::ocl; +using namespace std; + +#define EXT_FP64 0 + +#if !defined (HAVE_OPENCL) +void cv::ocl::matchTemplate(const oclMat&, const oclMat&, oclMat&) { throw_nogpu(); } +#else +//helper routines +namespace cv +{ + namespace ocl + { + ///////////////////////////OpenCL kernel strings/////////////////////////// + extern const char *match_template; + } +} + +namespace cv { namespace ocl +{ + void matchTemplate_SQDIFF( + const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf); + + void matchTemplate_SQDIFF_NORMED( + const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf); + + void matchTemplate_CCORR( + const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf); + + void matchTemplate_CCORR_NORMED( + const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf); + + void matchTemplate_CCOFF( + const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf); + + void matchTemplate_CCOFF_NORMED( + const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf); + + + void matchTemplateNaive_SQDIFF( + const oclMat& image, const oclMat& templ, oclMat& result, int cn); + + void matchTemplateNaive_CCORR( + const oclMat& image, const oclMat& templ, oclMat& result, int cn); + + // Evaluates optimal template's area threshold. If + // template's area is less than the threshold, we use naive match + // template version, otherwise FFT-based (if available) + int getTemplateThreshold(int method, int depth) + { + switch (method) + { + case CV_TM_CCORR: + if (depth == CV_32F) return 250; + if (depth == CV_8U) return 300; + break; + case CV_TM_SQDIFF: + if (depth == CV_32F) return MAXSHORT; // do naive SQDIFF for CV_32F + if (depth == CV_8U) return 300; + break; + } + CV_Error(CV_StsBadArg, "getTemplateThreshold: unsupported match template mode"); + return 0; + } + + + ////////////////////////////////////////////////////////////////////// + // SQDIFF + void matchTemplate_SQDIFF( + const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf) + { + result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F); + if (templ.size().area() < getTemplateThreshold(CV_TM_SQDIFF, image.depth())) + { + matchTemplateNaive_SQDIFF(image, templ, result, image.channels()); + return; + } + else + { + // TODO + CV_Error(CV_StsBadArg, "Not supported yet for this size template"); + } + } + + void matchTemplate_SQDIFF_NORMED( + const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf) + { + matchTemplate_CCORR(image,templ,result,buf); + buf.image_sums.resize(1); + buf.image_sqsums.resize(1); + + integral(image.reshape(1), buf.image_sums[0], buf.image_sqsums[0]); + +#if EXT_FP64 && SQRSUM_FIXED + unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0]; +#else + Mat sqr_mat = templ.reshape(1); + unsigned long long templ_sqsum = (unsigned long long)sum(sqr_mat.mul(sqr_mat))[0]; +#endif + + Context *clCxt = image.clCxt; + string kernelName = "matchTemplate_Prepared_SQDIFF_NORMED"; + vector< pair > args; + + args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sums[0].data)); + args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data)); + args.push_back( make_pair( sizeof(cl_ulong), (void *)&templ_sqsum)); + args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows)); + args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols)); + args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows)); + args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols)); + args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sums[0].offset)); + args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sums[0].step)); + args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset)); + args.push_back( make_pair( sizeof(cl_int), (void *)&result.step)); + + size_t globalThreads[3] = {result.cols, result.rows, 1}; + size_t localThreads[3] = {32, 8, 1}; + openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, 1, CV_8U); + } + + void matchTemplateNaive_SQDIFF( + const oclMat& image, const oclMat& templ, oclMat& result, int cn) + { + CV_Assert((image.depth() == CV_8U && templ.depth() == CV_8U ) + || (image.depth() == CV_32F && templ.depth() == CV_32F) && result.depth() == CV_32F); + CV_Assert(image.channels() == templ.channels() && (image.channels() == 1 || image.channels() == 4) && result.channels() == 1); + CV_Assert(result.rows == image.rows - templ.rows + 1 && result.cols == image.cols - templ.cols + 1); + + Context *clCxt = image.clCxt; + string kernelName = "matchTemplate_Naive_SQDIFF"; + + vector< pair > args; + + args.push_back( make_pair( sizeof(cl_mem), (void *)&image.data)); + args.push_back( make_pair( sizeof(cl_mem), (void *)&templ.data)); + args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data)); + args.push_back( make_pair( sizeof(cl_int), (void *)&image.rows)); + args.push_back( make_pair( sizeof(cl_int), (void *)&image.cols)); + args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows)); + args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols)); + args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows)); + args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols)); + args.push_back( make_pair( sizeof(cl_int), (void *)&image.offset)); + args.push_back( make_pair( sizeof(cl_int), (void *)&templ.offset)); + args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset)); + args.push_back( make_pair( sizeof(cl_int), (void *)&image.step)); + args.push_back( make_pair( sizeof(cl_int), (void *)&templ.step)); + args.push_back( make_pair( sizeof(cl_int), (void *)&result.step)); + + size_t globalThreads[3] = {result.cols, result.rows, 1}; + size_t localThreads[3] = {32, 8, 1}; + openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.channels(), image.depth()); + } + + ////////////////////////////////////////////////////////////////////// + // CCORR + void matchTemplate_CCORR( + const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf) + { + result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F); + if (templ.size().area() < getTemplateThreshold(CV_TM_SQDIFF, image.depth())) + { + matchTemplateNaive_CCORR(image, templ, result, image.channels()); + return; + } + else + { + CV_Error(CV_StsBadArg, "Not supported yet for this size template"); + if(image.depth() == CV_8U && templ.depth() == CV_8U) + { + image.convertTo(buf.imagef, CV_32F); + templ.convertTo(buf.templf, CV_32F); + } + CV_Assert(image.channels() == 1); + oclMat o_result(image.size(), CV_MAKETYPE(CV_32F, image.channels())); + filter2D(buf.imagef,o_result,CV_32F,buf.templf, Point(0,0)); + result = o_result(Rect(0,0,image.rows - templ.rows + 1, image.cols - templ.cols + 1)); + } + } + + void matchTemplate_CCORR_NORMED( + const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf) + { + matchTemplate_CCORR(image,templ,result,buf); + buf.image_sums.resize(1); + buf.image_sqsums.resize(1); + + integral(image.reshape(1), buf.image_sums[0], buf.image_sqsums[0]); +#if EXT_FP64 && SQRSUM_FIXED + unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0]; +#elif EXT_FP64 + oclMat templ_c1 = templ.reshape(1); + multiply(templ_c1, templ_c1, templ_c1); + unsigned long long templ_sqsum = (unsigned long long)sum(templ_c1)[0]; +#else + Mat m_templ_c1 = templ.reshape(1); + multiply(m_templ_c1, m_templ_c1, m_templ_c1); + unsigned long long templ_sqsum = (unsigned long long)sum(m_templ_c1)[0]; +#endif + Context *clCxt = image.clCxt; + string kernelName = "normalizeKernel"; + vector< pair > args; + + args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sqsums[0].data)); + args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data)); + args.push_back( make_pair( sizeof(cl_ulong), (void *)&templ_sqsum)); + args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows)); + args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols)); + args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows)); + args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols)); + args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].offset)); + args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].step)); + args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset)); + args.push_back( make_pair( sizeof(cl_int), (void *)&result.step)); + + size_t globalThreads[3] = {result.cols, result.rows, 1}; + size_t localThreads[3] = {32, 8, 1}; + openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, 1, CV_8U); + } + + void matchTemplateNaive_CCORR( + const oclMat& image, const oclMat& templ, oclMat& result, int cn) + { + CV_Assert((image.depth() == CV_8U && templ.depth() == CV_8U ) + || (image.depth() == CV_32F && templ.depth() == CV_32F) && result.depth() == CV_32F); + CV_Assert(image.channels() == templ.channels() && (image.channels() == 1 || image.channels() == 4) && result.channels() == 1); + CV_Assert(result.rows == image.rows - templ.rows + 1 && result.cols == image.cols - templ.cols + 1); + + Context *clCxt = image.clCxt; + string kernelName = "matchTemplate_Naive_CCORR"; + + vector< pair > args; + + args.push_back( make_pair( sizeof(cl_mem), (void *)&image.data)); + args.push_back( make_pair( sizeof(cl_mem), (void *)&templ.data)); + args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data)); + args.push_back( make_pair( sizeof(cl_int), (void *)&image.rows)); + args.push_back( make_pair( sizeof(cl_int), (void *)&image.cols)); + args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows)); + args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols)); + args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows)); + args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols)); + args.push_back( make_pair( sizeof(cl_int), (void *)&image.offset)); + args.push_back( make_pair( sizeof(cl_int), (void *)&templ.offset)); + args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset)); + args.push_back( make_pair( sizeof(cl_int), (void *)&image.step)); + args.push_back( make_pair( sizeof(cl_int), (void *)&templ.step)); + args.push_back( make_pair( sizeof(cl_int), (void *)&result.step)); + + size_t globalThreads[3] = {result.cols, result.rows, 1}; + size_t localThreads[3] = {32, 8, 1}; + openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.channels(), image.depth()); + } + ////////////////////////////////////////////////////////////////////// + // CCOFF + void matchTemplate_CCOFF( + const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf) + { + CV_Assert(image.depth() == CV_8U && templ.depth() == CV_8U); + + matchTemplate_CCORR(image,templ,result,buf); + + Context *clCxt = image.clCxt; + string kernelName; + + kernelName = "matchTemplate_Prepared_CCOFF"; + size_t globalThreads[3] = {result.cols, result.rows, 1}; + size_t localThreads[3] = {32, 8, 1}; + + vector< pair > args; + args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data) ); + args.push_back( make_pair( sizeof(cl_int), (void *)&image.rows) ); + args.push_back( make_pair( sizeof(cl_int), (void *)&image.cols) ); + args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows) ); + args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols) ); + args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows) ); + args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols) ); + args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset)); + args.push_back( make_pair( sizeof(cl_int), (void *)&result.step)); + // to be continued in the following section + if(image.channels() == 1) + { + buf.image_sums.resize(1); + // FIXME: temp fix for incorrect integral kernel + oclMat tmp_oclmat; + integral(image, buf.image_sums[0], tmp_oclmat); + + float templ_sum = 0; +#if EXT_FP64 + templ_sum = (float)sum(templ)[0] / templ.size().area(); +#else + Mat o_templ = templ; + templ_sum = (float)sum(o_templ)[0] / o_templ.size().area(); // temp fix for non-double supported machine +#endif + args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sums[0].data) ); + args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sums[0].offset) ); + args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sums[0].step) ); + args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum) ); + } + else + { + Vec4f templ_sum = Vec4f::all(0); +#if EXT_FP64 + split(image,buf.images); + templ_sum = sum(templ) / templ.size().area(); +#else + // temp fix for non-double supported machine + Mat o_templ = templ, o_image = image; + vector o_mat_vector; + o_mat_vector.resize(image.channels()); + buf.images.resize(image.channels()); + split(o_image, o_mat_vector); + for(int i = 0; i < o_mat_vector.size(); i ++) + { + buf.images[i] = oclMat(o_mat_vector[i]); + } + templ_sum = sum(o_templ) / templ.size().area(); +#endif + buf.image_sums.resize(buf.images.size()); + + for(int i = 0; i < image.channels(); i ++) + { + // FIXME: temp fix for incorrect integral kernel + oclMat omat_temp; + integral(buf.images[i], buf.image_sums[i], omat_temp); + } + switch(image.channels()) + { + case 4: + args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sums[0].data) ); + args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sums[1].data) ); + args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sums[2].data) ); + args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sums[3].data) ); + args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sums[0].offset) ); + args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sums[0].step) ); + args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum[0]) ); + args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum[1]) ); + args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum[2]) ); + args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum[3]) ); + break; + default: + CV_Error(CV_StsBadArg, "matchTemplate: unsupported number of channels"); + break; + } + } + openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.channels(), image.depth()); + } + + void matchTemplate_CCOFF_NORMED( + const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf) + { + image.convertTo(buf.imagef, CV_32F); + templ.convertTo(buf.templf, CV_32F); + + matchTemplate_CCORR(buf.imagef, buf.templf, result, buf); + float scale = 1.f/templ.size().area(); + + Context *clCxt = image.clCxt; + string kernelName; + + kernelName = "matchTemplate_Prepared_CCOFF_NORMED"; + size_t globalThreads[3] = {result.cols, result.rows, 1}; + size_t localThreads[3] = {32, 8, 1}; + + vector< pair > args; + args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data) ); + args.push_back( make_pair( sizeof(cl_int), (void *)&image.rows) ); + args.push_back( make_pair( sizeof(cl_int), (void *)&image.cols) ); + args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows) ); + args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols) ); + args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows) ); + args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols) ); + args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset)); + args.push_back( make_pair( sizeof(cl_int), (void *)&result.step)); + args.push_back( make_pair( sizeof(cl_float),(void *)&scale) ); + // to be continued in the following section + if(image.channels() == 1) + { + buf.image_sums.resize(1); + buf.image_sqsums.resize(1); + integral(image, buf.image_sums[0], buf.image_sqsums[0]); + float templ_sum = 0; + float templ_sqsum = 0; +#if EXT_FP64 + templ_sum = (float)sum(templ)[0]; +#if SQRSUM_FIXED + templ_sqsum = sqrSum(templ); +#else + oclMat templ_sqr = templ; + multiply(templ,templ, templ_sqr); + templ_sqsum = sum(templ_sqr)[0]; +#endif //SQRSUM_FIXED + templ_sqsum -= scale * templ_sum * templ_sum; + templ_sum *= scale; +#else + // temp fix for non-double supported machine + Mat o_templ = templ; + templ_sum = (float)sum(o_templ)[0]; + templ_sqsum = sum(o_templ.mul(o_templ))[0] - scale * templ_sum * templ_sum; + templ_sum *= scale; +#endif + args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sums[0].data) ); + args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sums[0].offset) ); + args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sums[0].step) ); + args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sqsums[0].data) ); + args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].offset) ); + args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].step) ); + args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum) ); + args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sqsum) ); + } + else + { + Vec4f templ_sum = Vec4f::all(0); + Vec4f templ_sqsum = Vec4f::all(0); +#if EXT_FP64 + split(image,buf.images); + templ_sum = sum(templ); +#if SQRSUM_FIXED + templ_sqsum = sqrSum(templ); +#else + oclMat templ_sqr = templ; + multiply(templ,templ, templ_sqr); + templ_sqsum = sum(templ_sqr); +#endif //SQRSUM_FIXED + templ_sqsum -= scale * templ_sum * templ_sum; + +#else + // temp fix for non-double supported machine + Mat o_templ = templ, o_image = image; + + vector o_mat_vector; + o_mat_vector.resize(image.channels()); + buf.images.resize(image.channels()); + split(o_image, o_mat_vector); + for(int i = 0; i < o_mat_vector.size(); i ++) + { + buf.images[i] = oclMat(o_mat_vector[i]); + } + templ_sum = sum(o_templ); + templ_sqsum = sum(o_templ.mul(o_templ)); +#endif + float templ_sqsum_sum = 0; + for(int i = 0; i < image.channels(); i ++) + { + templ_sqsum_sum += templ_sqsum[i] - scale * templ_sum[i] * templ_sum[i]; + } + templ_sum *= scale; + buf.image_sums.resize(buf.images.size()); + buf.image_sqsums.resize(buf.images.size()); + + for(int i = 0; i < image.channels(); i ++) + { + integral(buf.images[i], buf.image_sums[i], buf.image_sqsums[i]); + } + + switch(image.channels()) + { + case 4: + args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sums[0].data) ); + args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sums[1].data) ); + args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sums[2].data) ); + args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sums[3].data) ); + args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sums[0].offset) ); + args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sums[0].step) ); + args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sqsums[0].data) ); + args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sqsums[1].data) ); + args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sqsums[2].data) ); + args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sqsums[3].data) ); + args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].offset) ); + args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].step) ); + args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum[0]) ); + args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum[1]) ); + args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum[2]) ); + args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum[3]) ); + args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sqsum_sum) ); + break; + default: + CV_Error(CV_StsBadArg, "matchTemplate: unsupported number of channels"); + break; + } + } + openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.channels(), image.depth()); + } + +}/*ocl*/} /*cv*/ + +void cv::ocl::matchTemplate(const oclMat& image, const oclMat& templ, oclMat& result, int method) +{ + MatchTemplateBuf buf; + matchTemplate(image,templ, result, method,buf); +} +void cv::ocl::matchTemplate(const oclMat& image, const oclMat& templ, oclMat& result, int method, MatchTemplateBuf& buf) +{ + CV_Assert(image.type() == templ.type()); + CV_Assert(image.cols >= templ.cols && image.rows >= templ.rows); + + typedef void (*Caller)(const oclMat&, const oclMat&, oclMat&, MatchTemplateBuf&); + + const Caller callers[] = { + ::matchTemplate_SQDIFF, ::matchTemplate_SQDIFF_NORMED, + ::matchTemplate_CCORR, ::matchTemplate_CCORR_NORMED, + ::matchTemplate_CCOFF, ::matchTemplate_CCOFF_NORMED + }; + + Caller caller = callers[method]; + CV_Assert(caller); + caller(image, templ, result, buf); +} +#endif // diff --git a/modules/ocl/src/pyrdown.cpp b/modules/ocl/src/pyrdown.cpp new file mode 100644 index 0000000000..3f0a241cf7 --- /dev/null +++ b/modules/ocl/src/pyrdown.cpp @@ -0,0 +1,115 @@ + +#include "precomp.hpp" + +using namespace cv; +using namespace cv::ocl; +using namespace std; + +using std::cout; +using std::endl; + +namespace cv +{ + namespace ocl + { + ///////////////////////////OpenCL kernel strings/////////////////////////// + extern const char *pyr_down; + + } +} + +////////////////////////////////////////////////////////////////////////////// +/////////////////////// add subtract multiply divide ///////////////////////// +////////////////////////////////////////////////////////////////////////////// +template +void pyrdown_run(const oclMat &src, const oclMat &dst) +{ + CV_Assert(src.cols / 2 == dst.cols && src.rows / 2 == dst.rows); + + CV_Assert(src.type() == dst.type()); + CV_Assert(src.depth() != CV_8S); + + Context *clCxt = src.clCxt; + //int channels = dst.channels(); + //int depth = dst.depth(); + + string kernelName = "pyrDown"; + + //int vector_lengths[4][7] = {{4, 0, 4, 4, 1, 1, 1}, + // {4, 0, 4, 4, 1, 1, 1}, + // {4, 0, 4, 4, 1, 1, 1}, + // {4, 0, 4, 4, 1, 1, 1} + //}; + + //size_t vector_length = vector_lengths[channels-1][depth]; + //int offset_cols = (dst.offset / dst.elemSize1()) & (vector_length - 1); + + size_t localThreads[3] = { 256, 1, 1 }; + size_t globalThreads[3] = { src.cols, dst.rows, 1}; + + //int dst_step1 = dst.cols * dst.elemSize(); + vector > args; + args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data )); + args.push_back( make_pair( sizeof(cl_int), (void *)&src.step )); + args.push_back( make_pair( sizeof(cl_int), (void *)&src.offset )); + args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows)); + args.push_back( make_pair( sizeof(cl_int), (void *)&src.cols)); + args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data )); + args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step )); + args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset )); + args.push_back( make_pair( sizeof(cl_int), (void *)&dst.cols)); + + openCLExecuteKernel(clCxt, &pyr_down, kernelName, globalThreads, localThreads, args, src.channels(), src.depth()); +} +void pyrdown_run(const oclMat &src, const oclMat &dst) +{ + switch(src.depth()) + { + case 0: + pyrdown_run(src, dst); + break; + + case 1: + pyrdown_run(src, dst); + break; + + case 2: + pyrdown_run(src, dst); + break; + + case 3: + pyrdown_run(src, dst); + break; + + case 4: + pyrdown_run(src, dst); + break; + + case 5: + pyrdown_run(src, dst); + break; + + case 6: + pyrdown_run(src, dst); + break; + + default: + break; + } +} +////////////////////////////////////////////////////////////////////////////// +// pyrDown + +void cv::ocl::pyrDown(const oclMat& src, oclMat& dst) +{ + CV_Assert(src.depth() <= CV_32F && src.channels() <= 4); + + //src.step = src.rows; + + dst.create((src.rows + 1) / 2, (src.cols + 1) / 2, src.type()); + + //dst.step = dst.rows; + + pyrdown_run(src, dst); +} + diff --git a/modules/ocl/src/pyrup.cpp b/modules/ocl/src/pyrup.cpp new file mode 100644 index 0000000000..ee0dfe382d --- /dev/null +++ b/modules/ocl/src/pyrup.cpp @@ -0,0 +1,88 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// @Authors +// Zhang Chunpeng chunpeng@multicorewareinc.com +// +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other oclMaterials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +/* Haar features calculation */ +//#define EMU + +#include "precomp.hpp" + +using namespace cv; +using namespace cv::ocl; +using namespace std; + +#ifndef HAVE_OPENCL +void cv::ocl::pyrUp(const oclMat&, GpuMat&, oclMat&) { throw_nogpu(); } +#else + +namespace cv { namespace ocl +{ + extern const char *pyr_up; + void pyrUp(const cv::ocl::oclMat& src,cv::ocl::oclMat& dst) + { + dst.create(src.rows * 2, src.cols * 2, src.type()); + Context *clCxt = src.clCxt; + + const std::string kernelName = "pyrUp"; + + std::vector< pair > args; + args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data)); + args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data)); + args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows)); + args.push_back( make_pair( sizeof(cl_int), (void *)&dst.rows)); + args.push_back( make_pair( sizeof(cl_int), (void *)&src.cols)); + args.push_back( make_pair( sizeof(cl_int), (void *)&dst.cols)); + args.push_back( make_pair( sizeof(cl_int), (void *)&src.offset)); + args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset)); + args.push_back( make_pair( sizeof(cl_int), (void *)&src.step)); + args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step)); + + size_t globalThreads[3] = {dst.cols, dst.rows, 1}; + size_t localThreads[3] = {16, 16, 1}; + + openCLExecuteKernel(clCxt, &pyr_up, kernelName, globalThreads, localThreads, args, src.channels(), src.depth()); + } +}}; +#endif // HAVE_OPENCL \ No newline at end of file diff --git a/modules/ocl/test/test_blend.cpp b/modules/ocl/test/test_blend.cpp new file mode 100644 index 0000000000..a0391b1bb0 --- /dev/null +++ b/modules/ocl/test/test_blend.cpp @@ -0,0 +1,83 @@ +#include "precomp.hpp" +#include + +using namespace cv; +using namespace cv::ocl; +using namespace cvtest; +using namespace testing; +using namespace std; + +template +void blendLinearGold(const cv::Mat& img1, const cv::Mat& img2, const cv::Mat& weights1, const cv::Mat& weights2, cv::Mat& result_gold) +{ + result_gold.create(img1.size(), img1.type()); + + int cn = img1.channels(); + + for (int y = 0; y < img1.rows; ++y) + { + const float* weights1_row = weights1.ptr(y); + const float* weights2_row = weights2.ptr(y); + const T* img1_row = img1.ptr(y); + const T* img2_row = img2.ptr(y); + T* result_gold_row = result_gold.ptr(y); + + for (int x = 0; x < img1.cols * cn; ++x) + { + float w1 = weights1_row[x / cn]; + float w2 = weights2_row[x / cn]; + result_gold_row[x] = static_cast((img1_row[x] * w1 + img2_row[x] * w2) / (w1 + w2 + 1e-5f)); + } + } +} + +PARAM_TEST_CASE(Blend, cv::Size, MatType/*, UseRoi*/) +{ + std::vector oclinfo; + cv::Size size; + int type; + bool useRoi; + + virtual void SetUp() + { + //devInfo = GET_PARAM(0); + size = GET_PARAM(0); + type = GET_PARAM(1); + /*useRoi = GET_PARAM(3);*/ + + int devnums = getDevice(oclinfo, OPENCV_DEFAULT_OPENCL_DEVICE); + CV_Assert(devnums > 0); + } +}; + +TEST_P(Blend, Accuracy) +{ + int depth = CV_MAT_DEPTH(type); + + cv::Mat img1 = randomMat(size, type, 0.0, depth == CV_8U ? 255.0 : 1.0); + cv::Mat img2 = randomMat(size, type, 0.0, depth == CV_8U ? 255.0 : 1.0); + cv::Mat weights1 = randomMat(size, CV_32F, 0, 1); + cv::Mat weights2 = randomMat(size, CV_32F, 0, 1); + + cv::ocl::oclMat gimg1(size, type), gimg2(size, type), gweights1(size, CV_32F), gweights2(size, CV_32F); + cv::ocl::oclMat dst(size, type); + gimg1.upload(img1); + gimg2.upload(img2); + gweights1.upload(weights1); + gweights2.upload(weights2); + cv::ocl::blendLinear(gimg1, gimg2, gweights1, gweights2, dst); + cv::Mat result; + cv::Mat result_gold; + dst.download(result); + if (depth == CV_8U) + blendLinearGold(img1, img2, weights1, weights2, result_gold); + else + blendLinearGold(img1, img2, weights1, weights2, result_gold); + + EXPECT_MAT_NEAR(result_gold, result, CV_MAT_DEPTH(type) == CV_8U ? 1 : 1e-5f, NULL) +} + +INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Blend, Combine( + DIFFERENT_SIZES, + testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC4)) +)); \ No newline at end of file diff --git a/modules/ocl/test/test_columnsum.cpp b/modules/ocl/test/test_columnsum.cpp new file mode 100644 index 0000000000..94e109d200 --- /dev/null +++ b/modules/ocl/test/test_columnsum.cpp @@ -0,0 +1,108 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// @Authors +// Chunpeng Zhang chunpeng@multicorewareinc.com +// +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other oclMaterials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "precomp.hpp" +#include + +/////////////////////////////////////////////////////////////////////////////// +/// ColumnSum + +#ifdef HAVE_OPENCL + +//////////////////////////////////////////////////////////////////////// +// ColumnSum + +PARAM_TEST_CASE(ColumnSum, cv::Size, bool ) +{ + cv::Size size; + cv::Mat src; + bool useRoi; + std::vector oclinfo; + + virtual void SetUp() + { + size = GET_PARAM(0); + useRoi = GET_PARAM(1); + int devnums = getDevice(oclinfo, OPENCV_DEFAULT_OPENCL_DEVICE); + CV_Assert(devnums > 0); + } +}; + +TEST_P(ColumnSum, Accuracy) +{ + cv::Mat src = randomMat(size, CV_32FC1); + //cv::Mat src(size,CV_32FC1); + + //cv::ocl::oclMat d_dst = ::createMat(size,src.type(),useRoi); + cv::ocl::oclMat d_dst = loadMat(src,useRoi); + + cv::ocl::columnSum(loadMat(src,useRoi),d_dst); + + cv::Mat dst(d_dst); + + for (int j = 0; j < src.cols; ++j) + { + float gold = src.at(0, j); + float res = dst.at(0, j); + ASSERT_NEAR(res, gold, 1e-5); + } + + for (int i = 1; i < src.rows; ++i) + { + for (int j = 0; j < src.cols; ++j) + { + float gold = src.at(i, j) += src.at(i - 1, j); + float res = dst.at(i, j); + ASSERT_NEAR(res, gold, 1e-5); + } + } +} + +INSTANTIATE_TEST_CASE_P(GPU_ImgProc, ColumnSum, testing::Combine( + DIFFERENT_SIZES,testing::Values(Inverse(false),Inverse(true)))); + + +#endif diff --git a/modules/ocl/test/test_fft.cpp b/modules/ocl/test/test_fft.cpp new file mode 100644 index 0000000000..4b51d4feca --- /dev/null +++ b/modules/ocl/test/test_fft.cpp @@ -0,0 +1,97 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// @Authors +// Peng Xiao, pengxiao@multicorewareinc.com +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other oclMaterials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors as is and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "precomp.hpp" +using namespace std; +#ifdef HAVE_CLAMDFFT +//////////////////////////////////////////////////////////////////////////// +// Dft +PARAM_TEST_CASE(Dft, cv::Size, bool) +{ + cv::Size dft_size; + bool dft_rows; + std::vector oclinfo; + virtual void SetUp() + { + int devnums = getDevice(oclinfo); + CV_Assert(devnums > 0); + dft_size = GET_PARAM(0); + dft_rows = GET_PARAM(1); + } +}; + +TEST_P(Dft, C2C) +{ + cv::Mat a = randomMat(dft_size, CV_32FC2, 0.0, 10.0); + cv::Mat b_gold; + int flags = 0; + flags |= dft_rows ? cv::DFT_ROWS : 0; + + cv::ocl::oclMat d_b; + + cv::dft(a, b_gold, flags); + cv::ocl::dft(cv::ocl::oclMat(a), d_b, a.size(), flags); + EXPECT_MAT_NEAR(b_gold, cv::Mat(d_b), a.size().area() * 1e-4, ""); +} + + +TEST_P(Dft, R2CthenC2R) +{ + cv::Mat a = randomMat(dft_size, CV_32FC1, 0.0, 10.0); + + int flags = 0; + //flags |= dft_rows ? cv::DFT_ROWS : 0; // not supported yet + + cv::ocl::oclMat d_b, d_c; + cv::ocl::dft(cv::ocl::oclMat(a), d_b, a.size(), flags); + cv::ocl::dft(d_b, d_c, a.size(), flags + cv::DFT_INVERSE + cv::DFT_REAL_OUTPUT); + EXPECT_MAT_NEAR(a, d_c, a.size().area() * 1e-4, ""); +} + +INSTANTIATE_TEST_CASE_P(ocl_DFT, Dft, testing::Combine( + testing::Values(cv::Size(5, 4), cv::Size(20, 20)), + testing::Values(false, true))); + +#endif // HAVE_CLAMDFFT diff --git a/modules/ocl/test/test_gemm.cpp b/modules/ocl/test/test_gemm.cpp new file mode 100644 index 0000000000..a836149cb0 --- /dev/null +++ b/modules/ocl/test/test_gemm.cpp @@ -0,0 +1,85 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// @Authors +// Peng Xiao, pengxiao@multicorewareinc.com +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other oclMaterials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors as is and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + + +#include "precomp.hpp" +using namespace std; +#ifdef HAVE_CLAMDBLAS +//////////////////////////////////////////////////////////////////////////// +// GEMM +PARAM_TEST_CASE(Gemm, int, cv::Size, int) +{ + int type; + cv::Size mat_size; + int flags; + vector info; + virtual void SetUp() + { + type = GET_PARAM(0); + mat_size = GET_PARAM(1); + flags = GET_PARAM(2); + cv::ocl::getDevice(info); + } +}; + +TEST_P(Gemm, Accuracy) +{ + cv::Mat a = randomMat(mat_size, type, 0.0, 10.0); + cv::Mat b = randomMat(mat_size, type, 0.0, 10.0); + cv::Mat c = randomMat(mat_size, type, 0.0, 10.0); + + cv::Mat dst; + cv::ocl::oclMat ocl_dst; + + cv::gemm(a, b, 1.0, c, 1.0, dst, flags); + cv::ocl::gemm(cv::ocl::oclMat(a), cv::ocl::oclMat(b), 1.0, cv::ocl::oclMat(c), 1.0, ocl_dst, flags); + + EXPECT_MAT_NEAR(dst, ocl_dst, mat_size.area() * 1e-4, ""); +} + +INSTANTIATE_TEST_CASE_P(ocl_gemm, Gemm, testing::Combine( + testing::Values(CV_32FC1, CV_32FC2/*, CV_64FC1, CV_64FC2*/), + testing::Values(cv::Size(20, 20), cv::Size(300, 300)), + testing::Values(0, cv::GEMM_1_T, cv::GEMM_2_T, cv::GEMM_1_T + cv::GEMM_2_T))); +#endif diff --git a/modules/ocl/test/test_match_template.cpp b/modules/ocl/test/test_match_template.cpp new file mode 100644 index 0000000000..7d599a6152 --- /dev/null +++ b/modules/ocl/test/test_match_template.cpp @@ -0,0 +1,172 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// @Authors +// Peng Xiao, pengxiao@multicorewareinc.com +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other oclMaterials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors as is and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + + +#include "precomp.hpp" +#define PERF_TEST 0 + +//////////////////////////////////////////////////////////////////////////////// +// MatchTemplate +#define ALL_TEMPLATE_METHODS testing::Values(TemplateMethod(cv::TM_SQDIFF), TemplateMethod(cv::TM_CCORR), TemplateMethod(cv::TM_CCOEFF), TemplateMethod(cv::TM_SQDIFF_NORMED), TemplateMethod(cv::TM_CCORR_NORMED), TemplateMethod(cv::TM_CCOEFF_NORMED)) + +IMPLEMENT_PARAM_CLASS(TemplateSize, cv::Size); + +const char* TEMPLATE_METHOD_NAMES[6] = {"TM_SQDIFF", "TM_SQDIFF_NORMED", "TM_CCORR", "TM_CCORR_NORMED", "TM_CCOEFF", "TM_CCOEFF_NORMED"}; + +PARAM_TEST_CASE(MatchTemplate8U, cv::Size, TemplateSize, Channels, TemplateMethod) +{ + cv::Size size; + cv::Size templ_size; + int cn; + int method; + std::vector oclinfo; + + virtual void SetUp() + { + size = GET_PARAM(0); + templ_size = GET_PARAM(1); + cn = GET_PARAM(2); + method = GET_PARAM(3); + int devnums = getDevice(oclinfo, OPENCV_DEFAULT_OPENCL_DEVICE); + CV_Assert(devnums > 0); + } +}; + +TEST_P(MatchTemplate8U, Accuracy) +{ + + std::cout << "Method: " << TEMPLATE_METHOD_NAMES[method] << std::endl; + std::cout << "Image Size: (" << size.width << ", " << size.height << ")"<< std::endl; + std::cout << "Template Size: (" << templ_size.width << ", " << templ_size.height << ")"<< std::endl; + std::cout << "Channels: " << cn << std::endl; + + cv::Mat image = randomMat(size, CV_MAKETYPE(CV_8U, cn)); + cv::Mat templ = randomMat(templ_size, CV_MAKETYPE(CV_8U, cn)); + + cv::ocl::oclMat dst, ocl_image(image), ocl_templ(templ); + cv::ocl::matchTemplate(ocl_image, ocl_templ, dst, method); + + cv::Mat dst_gold; + cv::matchTemplate(image, templ, dst_gold, method); + + char sss [100] = ""; + + cv::Mat mat_dst; + dst.download(mat_dst); + + + EXPECT_MAT_NEAR(dst_gold, mat_dst, templ_size.area() * 1e-1, sss); + +#if PERF_TEST + { + P_TEST_FULL({}, {cv::ocl::matchTemplate(ocl_image, ocl_templ, dst, method);}, {}); + P_TEST_FULL({}, {cv::matchTemplate(image, templ, dst_gold, method);}, {}); + } +#endif // PERF_TEST +} + +PARAM_TEST_CASE(MatchTemplate32F, cv::Size, TemplateSize, Channels, TemplateMethod) +{ + cv::Size size; + cv::Size templ_size; + int cn; + int method; + std::vector oclinfo; + + virtual void SetUp() + { + size = GET_PARAM(0); + templ_size = GET_PARAM(1); + cn = GET_PARAM(2); + method = GET_PARAM(3); + int devnums = getDevice(oclinfo, OPENCV_DEFAULT_OPENCL_DEVICE); + CV_Assert(devnums > 0); + } +}; + +TEST_P(MatchTemplate32F, Accuracy) +{ + cv::Mat image = randomMat(size, CV_MAKETYPE(CV_32F, cn)); + cv::Mat templ = randomMat(templ_size, CV_MAKETYPE(CV_32F, cn)); + + cv::ocl::oclMat dst, ocl_image(image), ocl_templ(templ); + cv::ocl::matchTemplate(ocl_image, ocl_templ, dst, method); + + cv::Mat dst_gold; + cv::matchTemplate(image, templ, dst_gold, method); + + char sss [100] = ""; + + cv::Mat mat_dst; + dst.download(mat_dst); + + EXPECT_MAT_NEAR(dst_gold, mat_dst, templ_size.area() * 1e-1, sss); + +#if PERF_TEST + { + std::cout << "Method: " << TEMPLATE_METHOD_NAMES[method] << std::endl; + std::cout << "Image Size: (" << size.width << ", " << size.height << ")"<< std::endl; + std::cout << "Template Size: (" << templ_size.width << ", " << templ_size.height << ")"<< std::endl; + std::cout << "Channels: " << cn << std::endl; + P_TEST_FULL({}, {cv::ocl::matchTemplate(ocl_image, ocl_templ, dst, method);}, {}); + P_TEST_FULL({}, {cv::matchTemplate(image, templ, dst_gold, method);}, {}); + } +#endif // PERF_TEST +} + +INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MatchTemplate8U, + testing::Combine( + DIFFERENT_SIZES, + testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16))/*, TemplateSize(cv::Size(30, 30))*/), + testing::Values(Channels(1), Channels(4)), + ALL_TEMPLATE_METHODS + ) +); + +INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MatchTemplate32F, testing::Combine( + DIFFERENT_SIZES, + testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16))/*, TemplateSize(cv::Size(30, 30))*/), + testing::Values(Channels(1), Channels(4)), + testing::Values(TemplateMethod(cv::TM_SQDIFF), TemplateMethod(cv::TM_CCORR)))); + diff --git a/modules/ocl/test/test_pyrdown.cpp b/modules/ocl/test/test_pyrdown.cpp new file mode 100644 index 0000000000..f2270b4a8c --- /dev/null +++ b/modules/ocl/test/test_pyrdown.cpp @@ -0,0 +1,295 @@ +/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// @Authors +// Dachuan Zhao, dachuan@multicorewareinc.com +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other oclMaterials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +//#define PRINT_CPU_TIME 1000 +//#define PRINT_TIME + + +#include "precomp.hpp" +#include + +#ifdef HAVE_OPENCL + +using namespace cv; +using namespace cv::ocl; +using namespace cvtest; +using namespace testing; +using namespace std; + +PARAM_TEST_CASE(PyrDown, MatType, bool) +{ + int type; + cv::Scalar val; + + //src mat + cv::Mat mat1; + cv::Mat mat2; + cv::Mat mask; + cv::Mat dst; + cv::Mat dst1; //bak, for two outputs + + // set up roi + int roicols; + int roirows; + int src1x; + int src1y; + int src2x; + int src2y; + int dstx; + int dsty; + int maskx; + int masky; + + + //src mat with roi + cv::Mat mat1_roi; + cv::Mat mat2_roi; + cv::Mat mask_roi; + cv::Mat dst_roi; + cv::Mat dst1_roi; //bak + std::vector oclinfo; + //ocl dst mat for testing + cv::ocl::oclMat gdst_whole; + cv::ocl::oclMat gdst1_whole; //bak + + //ocl mat with roi + cv::ocl::oclMat gmat1; + cv::ocl::oclMat gmat2; + cv::ocl::oclMat gdst; + cv::ocl::oclMat gdst1; //bak + cv::ocl::oclMat gmask; + + virtual void SetUp() + { + type = GET_PARAM(0); + + cv::RNG &rng = TS::ptr()->get_rng(); + + cv::Size size(MWIDTH, MHEIGHT); + + mat1 = randomMat(rng, size, type, 5, 16, false); + mat2 = randomMat(rng, size, type, 5, 16, false); + dst = randomMat(rng, size, type, 5, 16, false); + dst1 = randomMat(rng, size, type, 5, 16, false); + mask = randomMat(rng, size, CV_8UC1, 0, 2, false); + + cv::threshold(mask, mask, 0.5, 255., CV_8UC1); + + val = cv::Scalar(rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0)); + + int devnums = getDevice(oclinfo); + CV_Assert(devnums > 0); + //if you want to use undefault device, set it here + //setDevice(oclinfo[0]); + } + + void Cleanup() + { + mat1.release(); + mat2.release(); + mask.release(); + dst.release(); + dst1.release(); + mat1_roi.release(); + mat2_roi.release(); + mask_roi.release(); + dst_roi.release(); + dst1_roi.release(); + + gdst_whole.release(); + gdst1_whole.release(); + gmat1.release(); + gmat2.release(); + gdst.release(); + gdst1.release(); + gmask.release(); + } + + void random_roi() + { + cv::RNG &rng = TS::ptr()->get_rng(); + +#ifdef RANDOMROI + //randomize ROI + roicols = rng.uniform(1, mat1.cols); + roirows = rng.uniform(1, mat1.rows); + src1x = rng.uniform(0, mat1.cols - roicols); + src1y = rng.uniform(0, mat1.rows - roirows); + dstx = rng.uniform(0, dst.cols - roicols); + dsty = rng.uniform(0, dst.rows - roirows); +#else + roicols = mat1.cols; + roirows = mat1.rows; + src1x = 0; + src1y = 0; + dstx = 0; + dsty = 0; +#endif + maskx = rng.uniform(0, mask.cols - roicols); + masky = rng.uniform(0, mask.rows - roirows); + src2x = rng.uniform(0, mat2.cols - roicols); + src2y = rng.uniform(0, mat2.rows - roirows); + mat1_roi = mat1(Rect(src1x, src1y, roicols, roirows)); + mat2_roi = mat2(Rect(src2x, src2y, roicols, roirows)); + mask_roi = mask(Rect(maskx, masky, roicols, roirows)); + dst_roi = dst(Rect(dstx, dsty, roicols, roirows)); + dst1_roi = dst1(Rect(dstx, dsty, roicols, roirows)); + + gdst_whole = dst; + gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows)); + + gdst1_whole = dst1; + gdst1 = gdst1_whole(Rect(dstx, dsty, roicols, roirows)); + + gmat1 = mat1_roi; + gmat2 = mat2_roi; + gmask = mask_roi; //end + } + +}; + +#define VARNAME(A) string(#A); + + +void PrePrint() +{ + //for(int i = 0; i < MHEIGHT; i++) + //{ + // printf("(%d) ", i); + // for(int k = 0; k < MWIDTH; k++) + // { + // printf("%d ", mat1_roi.data[i * MHEIGHT + k]); + // } + // printf("\n"); + //} +} + +void PostPrint() +{ + //dst_roi.convertTo(dst_roi,CV_32S); + //cpu_dst.convertTo(cpu_dst,CV_32S); + //dst_roi -= cpu_dst; + //cpu_dst -= dst_roi; + //for(int i = 0; i < MHEIGHT / 2; i++) + //{ + // printf("(%d) ", i); + // for(int k = 0; k < MWIDTH / 2; k++) + // { + // if(gmat1.depth() == 0) + // { + // if(gmat1.channels() == 1) + // { + // printf("%d ", dst_roi.data[i * MHEIGHT / 2 + k]); + // } + // else + // { + // printf("%d ", ((unsigned*)dst_roi.data)[i * MHEIGHT / 2 + k]); + // } + // } + // else if(gmat1.depth() == 5) + // { + // printf("%.6f ", ((float*)dst_roi.data)[i * MHEIGHT / 2 + k]); + // } + // } + // printf("\n"); + //} + //for(int i = 0; i < MHEIGHT / 2; i++) + //{ + // printf("(%d) ", i); + // for(int k = 0; k < MWIDTH / 2; k++) + // { + // if(gmat1.depth() == 0) + // { + // if(gmat1.channels() == 1) + // { + // printf("%d ", cpu_dst.data[i * MHEIGHT / 2 + k]); + // } + // else + // { + // printf("%d ", ((unsigned*)cpu_dst.data)[i * MHEIGHT / 2 + k]); + // } + // } + // else if(gmat1.depth() == 5) + // { + // printf("%.6f ", ((float*)cpu_dst.data)[i * MHEIGHT / 2 + k]); + // } + // } + // printf("\n"); + //} +} + +////////////////////////////////PyrDown///////////////////////////////////////////////// +//struct PyrDown : ArithmTestBase {}; + +TEST_P(PyrDown, Mat) +{ + for(int j = 0; j < LOOP_TIMES; j++) + { + random_roi(); + + cv::pyrDown(mat1_roi, dst_roi); + cv::ocl::pyrDown(gmat1, gdst); + + cv::Mat cpu_dst; + gdst.download(cpu_dst); + char s[1024]; + sprintf(s, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y); + + EXPECT_MAT_NEAR(dst_roi, cpu_dst, dst_roi.depth() == CV_32F ? 1e-5f : 1.0f, s); + + Cleanup(); + } +} + + + + +//********test**************** +INSTANTIATE_TEST_CASE_P(GPU_ImgProc, PyrDown, Combine( + Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), + Values(false))); // Values(false) is the reserved parameter + + +#endif // HAVE_OPENCL diff --git a/modules/ocl/test/test_pyrup.cpp b/modules/ocl/test/test_pyrup.cpp new file mode 100644 index 0000000000..c6c5b9c10c --- /dev/null +++ b/modules/ocl/test/test_pyrup.cpp @@ -0,0 +1,91 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// @Authors +// Zhang Chunpeng chunpeng@multicorewareinc.com +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other oclMaterials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "precomp.hpp" +#include "opencv2/core/core.hpp" + +#ifdef HAVE_OPENCL + + +PARAM_TEST_CASE(PyrUp,cv::Size,int) +{ + cv::Size size; + int type; + std::vector oclinfo; + + virtual void SetUp() + { + int devnums = cv::ocl::getDevice(oclinfo, OPENCV_DEFAULT_OPENCL_DEVICE); + CV_Assert(devnums > 0); + size = GET_PARAM(0); + type = GET_PARAM(1); + } +}; + +TEST_P(PyrUp,Accuracy) +{ + cv::Mat src = randomMat(size,type); + + + cv::Mat dst_gold; + cv::pyrUp(src,dst_gold); + + cv::ocl::oclMat dst; + cv::ocl::oclMat srcMat(src); + cv::ocl::pyrUp(srcMat,dst); + char s[100]={0}; + + EXPECT_MAT_NEAR(dst_gold, dst, (src.depth() == CV_32F ? 1e-4f : 1.0),s); + +} + +#if 1 +INSTANTIATE_TEST_CASE_P(GPU_ImgProc, PyrUp, testing::Combine( + testing::Values(cv::Size(32, 32)), + testing::Values(MatType(CV_8UC1),MatType(CV_16UC1),MatType(CV_32FC1),MatType(CV_8UC4), + MatType(CV_16UC4),MatType(CV_32FC4)))); +#endif + +#endif // HAVE_OPENCL \ No newline at end of file diff --git a/modules/ts/misc/run.py b/modules/ts/misc/run.py index 79dad8d8fd..737a2eea35 100644 --- a/modules/ts/misc/run.py +++ b/modules/ts/misc/run.py @@ -337,10 +337,9 @@ class RunInfo(object): def getSvnVersion(self, path, name): if not path: - setattr(self, name, None) - return - if not self.svnversion_path and hostos == 'nt': - self.tryGetSvnVersionWithTortoise(path, name) + val = None + elif not self.svnversion_path and hostos == 'nt': + val = self.tryGetSvnVersionWithTortoise(path, name) else: svnversion = self.svnversion_path if not svnversion: @@ -348,11 +347,14 @@ class RunInfo(object): try: output = Popen([svnversion, "-n", path], stdout=PIPE, stderr=PIPE).communicate() if not output[1]: - setattr(self, name, output[0]) + val = output[0] else: - setattr(self, name, None) + val = None except OSError: - setattr(self, name, None) + val = None + if val: + val = val.replace(" ", "_") + setattr(self, name, val) def tryGetSvnVersionWithTortoise(self, path, name): try: @@ -371,9 +373,9 @@ class RunInfo(object): tmpfile = open(tmpfilename2, "r") version = tmpfile.read() tmpfile.close() - setattr(self, name, version) + return version except: - setattr(self, name, None) + return None finally: if dir: shutil.rmtree(dir) diff --git a/modules/video/src/bgfg_gmg.cpp b/modules/video/src/bgfg_gmg.cpp index 163445a45b..b4e9824d69 100644 --- a/modules/video/src/bgfg_gmg.cpp +++ b/modules/video/src/bgfg_gmg.cpp @@ -440,8 +440,7 @@ bool BackgroundSubtractorGMG::HistogramFeatureGMG::operator ==(HistogramFeatureG std::vector::iterator color_a; std::vector::iterator color_b; std::vector::iterator color_a_end = this->color.end(); - std::vector::iterator color_b_end = rhs.color.end(); - for (color_a = color.begin(),color_b =rhs.color.begin();color_a!=color_a_end;++color_a,++color_b) + for (color_a = color.begin(), color_b = rhs.color.begin(); color_a != color_a_end; ++color_a, ++color_b) { if (*color_a != *color_b) {