diff --git a/modules/gpu/src/cuda/surf.cu b/modules/gpu/src/cuda/surf.cu index 363e2815e4..551ccd9a72 100644 --- a/modules/gpu/src/cuda/surf.cu +++ b/modules/gpu/src/cuda/surf.cu @@ -566,9 +566,6 @@ namespace cv { namespace gpu { namespace surf float* s_sum_row = s_sum + threadIdx.y * 32; - //reduceSum32(s_sum_row, sumx); - //reduceSum32(s_sum_row, sumy); - warpReduce32(s_sum_row, sumx, threadIdx.x, plus<float>()); warpReduce32(s_sum_row, sumy, threadIdx.x, plus<float>()); diff --git a/modules/gpu/src/opencv2/gpu/device/utility.hpp b/modules/gpu/src/opencv2/gpu/device/utility.hpp index edfbae01ac..a3b4dafa94 100644 --- a/modules/gpu/src/opencv2/gpu/device/utility.hpp +++ b/modules/gpu/src/opencv2/gpu/device/utility.hpp @@ -46,13 +46,13 @@ #include "internal_shared.hpp" #include "saturate_cast.hpp" -#ifndef __CUDA_ARCH__ - #define __CUDA_ARCH__ 0 +#ifndef __CUDA_ARCH__ + #define __CUDA_ARCH__ 0 #endif -#define OPENCV_GPU_LOG_WARP_SIZE (5) -#define OPENCV_GPU_WARP_SIZE (1 << OPENCV_GPU_LOG_WARP_SIZE) -#define OPENCV_GPU_LOG_MEM_BANKS ((__CUDA_ARCH__ >= 200) ? 5 : 4) // 32 banks on fermi, 16 on tesla +#define OPENCV_GPU_LOG_WARP_SIZE (5) +#define OPENCV_GPU_WARP_SIZE (1 << OPENCV_GPU_LOG_WARP_SIZE) +#define OPENCV_GPU_LOG_MEM_BANKS ((__CUDA_ARCH__ >= 200) ? 
5 : 4) // 32 banks on fermi, 16 on tesla #define OPENCV_GPU_MEM_BANKS (1 << OPENCV_GPU_LOG_MEM_BANKS) #if defined(_WIN64) || defined(__LP64__) @@ -65,15 +65,15 @@ namespace cv { namespace gpu { namespace device { - template <typename T> void __host__ __device__ __forceinline__ swap(T &a, T &b) - { - T temp = a; - a = b; - b = temp; + template <typename T> void __host__ __device__ __forceinline__ swap(T &a, T &b) + { + T temp = a; + a = b; + b = temp; } // warp-synchronous 32 elements reduction - template <typename T, typename Op> __device__ __forceinline__ void warpReduce32(volatile T* data, volatile T& partial_reduction, int tid, Op op) + template <typename T, typename Op> __device__ __forceinline__ void warpReduce32(volatile T* data, T& partial_reduction, int tid, Op op) { data[tid] = partial_reduction; @@ -88,7 +88,7 @@ namespace cv { namespace gpu { namespace device } // warp-synchronous 16 elements reduction - template <typename T, typename Op> __device__ __forceinline__ void warpReduce16(volatile T* data, volatile T& partial_reduction, int tid, Op op) + template <typename T, typename Op> __device__ __forceinline__ void warpReduce16(volatile T* data, T& partial_reduction, int tid, Op op) { data[tid] = partial_reduction; @@ -102,7 +102,7 @@ namespace cv { namespace gpu { namespace device } // warp-synchronous reduction - template <int n, typename T, typename Op> __device__ __forceinline__ void warpReduce(volatile T* data, volatile T& partial_reduction, int tid, Op op) + template <int n, typename T, typename Op> __device__ __forceinline__ void warpReduce(volatile T* data, T& partial_reduction, int tid, Op op) { if (tid < n) data[tid] = partial_reduction; diff --git a/modules/gpu/test/test_main.cpp b/modules/gpu/test/test_main.cpp index 641f6942f1..05afdcdece 100644 --- a/modules/gpu/test/test_main.cpp +++ b/modules/gpu/test/test_main.cpp @@ -109,9 +109,11 @@ int main(int argc, char** argv) cvtest::TS::ptr()->init("gpu"); testing::InitGoogleTest(&argc, argv); - //cv::CommandLineParser parser(argc, (const char**)argv); + const char* keys ="{ nvtest_output_level | nvtest_output_level | none | NVidia test verbosity level }"; - std::string outputLevel = 
"none";//parser.get<std::string>("nvtest_output_level", "none"); + cv::CommandLineParser parser(argc, (const char**)argv, keys); + + std::string outputLevel = parser.get<std::string>("nvtest_output_level", "none"); if (outputLevel == "none") nvidiaTestOutputLevel = OutputLevelNone;