diff --git a/modules/gpu/src/cuda/surf.cu b/modules/gpu/src/cuda/surf.cu index 363e2815e4..551ccd9a72 100644 --- a/modules/gpu/src/cuda/surf.cu +++ b/modules/gpu/src/cuda/surf.cu @@ -566,9 +566,6 @@ namespace cv { namespace gpu { namespace surf float* s_sum_row = s_sum + threadIdx.y * 32; - //reduceSum32(s_sum_row, sumx); - //reduceSum32(s_sum_row, sumy); - warpReduce32(s_sum_row, sumx, threadIdx.x, plus<float>()); warpReduce32(s_sum_row, sumy, threadIdx.x, plus<float>()); diff --git a/modules/gpu/src/opencv2/gpu/device/utility.hpp b/modules/gpu/src/opencv2/gpu/device/utility.hpp index edfbae01ac..a3b4dafa94 100644 --- a/modules/gpu/src/opencv2/gpu/device/utility.hpp +++ b/modules/gpu/src/opencv2/gpu/device/utility.hpp @@ -46,13 +46,13 @@ #include "internal_shared.hpp" #include "saturate_cast.hpp" -#ifndef __CUDA_ARCH__ - #define __CUDA_ARCH__ 0 +#ifndef __CUDA_ARCH__ + #define __CUDA_ARCH__ 0 #endif -#define OPENCV_GPU_LOG_WARP_SIZE (5) -#define OPENCV_GPU_WARP_SIZE (1 << OPENCV_GPU_LOG_WARP_SIZE) -#define OPENCV_GPU_LOG_MEM_BANKS ((__CUDA_ARCH__ >= 200) ? 5 : 4) // 32 banks on fermi, 16 on tesla +#define OPENCV_GPU_LOG_WARP_SIZE (5) +#define OPENCV_GPU_WARP_SIZE (1 << OPENCV_GPU_LOG_WARP_SIZE) +#define OPENCV_GPU_LOG_MEM_BANKS ((__CUDA_ARCH__ >= 200) ? 
5 : 4) // 32 banks on fermi, 16 on tesla #define OPENCV_GPU_MEM_BANKS (1 << OPENCV_GPU_LOG_MEM_BANKS) #if defined(_WIN64) || defined(__LP64__) @@ -65,15 +65,15 @@ namespace cv { namespace gpu { namespace device { - template <typename T> void __host__ __device__ __forceinline__ swap(T &a, T &b) - { - T temp = a; - a = b; - b = temp; + template <typename T> void __host__ __device__ __forceinline__ swap(T &a, T &b) + { + T temp = a; + a = b; + b = temp; } // warp-synchronous 32 elements reduction - template <typename T, typename Op> __device__ __forceinline__ void warpReduce32(volatile T* data, volatile T& partial_reduction, int tid, Op op) + template <typename T, typename Op> __device__ __forceinline__ void warpReduce32(volatile T* data, T& partial_reduction, int tid, Op op) { data[tid] = partial_reduction; @@ -88,7 +88,7 @@ namespace cv { namespace gpu { namespace device } // warp-synchronous 16 elements reduction - template <typename T, typename Op> __device__ __forceinline__ void warpReduce16(volatile T* data, volatile T& partial_reduction, int tid, Op op) + template <typename T, typename Op> __device__ __forceinline__ void warpReduce16(volatile T* data, T& partial_reduction, int tid, Op op) { data[tid] = partial_reduction; @@ -102,7 +102,7 @@ namespace cv { namespace gpu { namespace device } // warp-synchronous reduction - template <int n, typename T, typename Op> __device__ __forceinline__ void warpReduce(volatile T* data, volatile T& partial_reduction, int tid, Op op) + template <int n, typename T, typename Op> __device__ __forceinline__ void warpReduce(volatile T* data, T& partial_reduction, int tid, Op op) { if (tid < n) data[tid] = partial_reduction; diff --git a/modules/gpu/test/test_main.cpp b/modules/gpu/test/test_main.cpp index 641f6942f1..05afdcdece 100644 --- a/modules/gpu/test/test_main.cpp +++ b/modules/gpu/test/test_main.cpp @@ -109,9 +109,11 @@ int main(int argc, char** argv) cvtest::TS::ptr()->init("gpu"); testing::InitGoogleTest(&argc, argv); - //cv::CommandLineParser parser(argc, (const char**)argv); + const char* keys ="{ nvtest_output_level | nvtest_output_level | none | NVidia test verbosity level }"; - std::string outputLevel = 
"none";//parser.get<std::string>("nvtest_output_level", "none"); + cv::CommandLineParser parser(argc, (const char**)argv, keys); + + std::string outputLevel = parser.get<std::string>("nvtest_output_level", "none"); if (outputLevel == "none") nvidiaTestOutputLevel = OutputLevelNone;