// Review notes for this patch. The text below is a unified diff whose line
// breaks have been collapsed; text between angle brackets also appears to be
// missing in this copy (e.g. `DevMem2D_`, a bare `template`, `std::vector`),
// so treat exact tokens and line boundaries with care.
//
// Files touched, per the `diff --git` headers:
//   modules/gpu/src/cascadeclassifier.cpp
//   modules/gpu/src/cuda/lbp.cu
//   modules/gpu/src/opencv2/gpu/device/lbp.hpp
//   modules/gpu/test/test_objdetect.cpp
//
// What the patch changes:
//   * connectedConmonents() gains an `objects` argument right after
//     `candidates` (declaration in cascadeclassifier.cpp, definition in
//     lbp.cu) and passes both buffers' pointers to the disjoin kernel.
//   * disjoin stores each accepted rectangle in
//     objects[atomicInc(nclasses, n)] instead of writing it back into
//     `candidates`, and resets `*nclasses` where it used to reset
//     `labels[n - 1]`.
//   * detectMultiScale creates a separate `candidates` GpuMat
//     (1 x defaultObjSearchNum, CV_32SC4) and gives it to classifyStump in
//     place of `objects`; passes `factor` where `scaleFactor` was passed;
//     declares `step` before the loop and reuses it for the return value;
//     drops the explicit integral.create() call; copies the count back to the
//     host after connectedConmonents; and releases `classified` and
//     `dclassified` before returning.
//   * lbp_classify_stump now computes x from threadIdx.x and y from
//     blockIdx.x, each multiplied by `step` (before: y from threadIdx.x and x
//     from blockIdx.x, each multiplied by `scale`).
//   * InSameComponint stores `eps` unscaled and applies the 0.5 factor inside
//     operator(), where `delta` becomes a float instead of a double.
//   * Adds the LBP_classify / Accuracy test: it runs the CPU and the GPU LBP
//     cascade on columns [0, cols / 2) of lbpcascade/er.png and draws both
//     sets of rectangles on a clone of the image.
//
// NOTE(review): the `if (groupThreshold <= 0 || objects.empty()) return 0;`
//   line appears to survive as context ahead of the new
//   `delete[] classified; cudaFree(dclassified);` lines, so that path would
//   return without releasing either buffer -- confirm, and free there too.
// NOTE(review): the allocation of `classified` is outside the visible hunks;
//   verify it is created with new[] so that `delete[]` matches.
// NOTE(review): cudaMalloc / cudaMemcpy / cudaFree results are not checked in
//   detectMultiScale although cudaDeviceSynchronize is wrapped in
//   cudaSafeCall; consider wrapping them the same way.
// NOTE(review): connectedConmonents launches disjoin as one block of
//   `candidates.cols` threads with threads * (sizeof(int) + sizeof(int4))
//   bytes of dynamic shared memory. `candidates` is now the full
//   defaultObjSearchNum-wide buffer rather than a header over the first
//   `*classified` entries, so the kernel is handed every slot, including any
//   that classifyStump did not write -- confirm this is intended and that the
//   width fits the per-block thread limit. Nothing checks the launch result.
// NOTE(review): in disjoin, `*nclasses = 0;` is a plain store and the counter
//   is later bumped with atomicInc(nclasses, n). The lines between those two
//   hunks are not shown; check that a barrier separates the reset from the
//   first increment, otherwise a late store could discard counted results.
// NOTE(review): `* 0.5` in InSameComponint::operator() is a double literal in
//   a __device__ function; `0.5f` would keep the arithmetic in float.
// NOTE(review): in the new test, ASSERT_FALSE(image.empty()) runs after
//   image.colRange() and cvtColor() have already used the image; it belongs
//   directly after cv::imread. The test body shown never compares `rects`
//   with the GPU output and never inspects `markedImage`, so as written it
//   only verifies that loading and detection run.
diff --git a/modules/gpu/src/cascadeclassifier.cpp b/modules/gpu/src/cascadeclassifier.cpp index d2f4120c12..da0bfe5efd 100644 --- a/modules/gpu/src/cascadeclassifier.cpp +++ b/modules/gpu/src/cascadeclassifier.cpp @@ -290,7 +290,7 @@ namespace cv { namespace gpu { namespace device DevMem2D_ objects, unsigned int* classified); - int connectedConmonents(DevMem2D_ candidates, int groupThreshold, float grouping_eps, unsigned int* nclasses); + int connectedConmonents(DevMem2D_ candidates, DevMem2D_ objects,int groupThreshold, float grouping_eps, unsigned int* nclasses); } }}} @@ -308,6 +308,7 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp else objects.create(1 , defaultObjSearchNum, CV_32SC4); + GpuMat candidates(1 , defaultObjSearchNum, CV_32SC4); if (maxObjectSize == cv::Size()) maxObjectSize = image.size(); @@ -317,6 +318,7 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp unsigned int* dclassified; cudaMalloc(&dclassified, sizeof(int)); cudaMemcpy(dclassified, classified, sizeof(int), cudaMemcpyHostToDevice); + int step; for( double factor = 1; ; factor *= scaleFactor ) { @@ -334,25 +336,22 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp // continue; cv::gpu::resize(image, scaledImageBuffer, scaledImageSize, 0, 0, CV_INTER_LINEAR); - - integral.create(cv::Size(scaledImageSize.width + 1, scaledImageSize.height + 1), CV_32SC1); cv::gpu::integral(scaledImageBuffer, integral); - int step = (factor <= 2.) + 1; + step = (factor <= 2.) 
+ 1; cv::gpu::device::lbp::classifyStump(stage_mat, stage_mat.cols / sizeof(Stage), nodes_mat, leaves_mat, subsets_mat, features_mat, - integral, processingRectSize.width, processingRectSize.height, windowSize.width, windowSize.height, scaleFactor, step, subsetSize, objects, dclassified); + integral, processingRectSize.width, processingRectSize.height, windowSize.width, windowSize.height, factor, step, subsetSize, candidates, dclassified); } - - cudaMemcpy(classified, dclassified, sizeof(int), cudaMemcpyDeviceToHost); - GpuMat candidates(1, *classified, objects.type(), objects.ptr()); - // std::cout << *classified << " Results: " << cv::Mat(candidates) << std::endl; - if (groupThreshold <= 0 || objects.empty()) return 0; - cv::gpu::device::lbp::connectedConmonents(candidates, groupThreshold, grouping_eps, dclassified); + cv::gpu::device::lbp::connectedConmonents(candidates, objects, groupThreshold, grouping_eps, dclassified); + cudaMemcpy(classified, dclassified, sizeof(int), cudaMemcpyDeviceToHost); cudaSafeCall( cudaDeviceSynchronize() ); - return *classified; + step = *classified; + delete[] classified; + cudaFree(dclassified); + return step; } // ============ old fashioned haar cascade ==============================================// diff --git a/modules/gpu/src/cuda/lbp.cu b/modules/gpu/src/cuda/lbp.cu index 5c273b32b9..cd469453b3 100644 --- a/modules/gpu/src/cuda/lbp.cu +++ b/modules/gpu/src/cuda/lbp.cu @@ -51,8 +51,8 @@ namespace cv { namespace gpu { namespace device __global__ void lbp_classify_stump(Stage* stages, int nstages, ClNode* nodes, const float* leaves, const int* subsets, const uchar4* features, const DevMem2Di integral, int workWidth, int workHeight, int clWidth, int clHeight, float scale, int step, int subsetSize, DevMem2D_ objects, unsigned int* n) { - int y = threadIdx.x * scale; - int x = blockIdx.x * scale; + int x = threadIdx.x * step; + int y = blockIdx.x * step; int current_node = 0; int current_leave = 0; @@ -92,7 +92,7 @@ namespace cv 
{ namespace gpu { namespace device } template - __global__ void disjoin(int4* candidates, unsigned int n, int groupThreshold, float grouping_eps, unsigned int* nclasses) + __global__ void disjoin(int4* candidates, int4* objects, unsigned int n, int groupThreshold, float grouping_eps, unsigned int* nclasses) { using cv::gpu::device::VecTraits; unsigned int tid = threadIdx.x; @@ -119,7 +119,7 @@ namespace cv { namespace gpu { namespace device __syncthreads(); atomicInc((unsigned int*)labels + cls, n); - labels[n - 1] = 0; + *nclasses = 0; int active = labels[tid]; if (active) @@ -152,11 +152,9 @@ namespace cv { namespace gpu { namespace device (n2 > max(3, n1) || n1 < 3) ) break; } - if( j == n) { - // printf("founded gpu %d %d %d %d \n", r1[0], r1[1], r1[2], r1[3]); - candidates[atomicInc((unsigned int*)labels + n -1, n)] = VecTraits::make(r1[0], r1[1], r1[2], r1[3]); + objects[atomicInc(nclasses, n)] = VecTraits::make(r1[0], r1[1], r1[2], r1[3]); } } } @@ -179,11 +177,11 @@ namespace cv { namespace gpu { namespace device workWidth, workHeight, clWidth, clHeight, scale, step, subsetSize, objects, classified); } - int connectedConmonents(DevMem2D_ candidates, int groupThreshold, float grouping_eps, unsigned int* nclasses) + int connectedConmonents(DevMem2D_ candidates, DevMem2D_ objects, int groupThreshold, float grouping_eps, unsigned int* nclasses) { int threads = candidates.cols; int smem_amount = threads * sizeof(int) + threads * sizeof(int4); - disjoin<<<1, threads, smem_amount>>>((int4*)candidates.ptr(), candidates.cols, groupThreshold, grouping_eps, nclasses); + disjoin<<<1, threads, smem_amount>>>((int4*)candidates.ptr(), (int4*)objects.ptr(), candidates.cols, groupThreshold, grouping_eps, nclasses); return 0; } } diff --git a/modules/gpu/src/opencv2/gpu/device/lbp.hpp b/modules/gpu/src/opencv2/gpu/device/lbp.hpp index 2b620b5627..f4ec78b300 100644 --- a/modules/gpu/src/opencv2/gpu/device/lbp.hpp +++ b/modules/gpu/src/opencv2/gpu/device/lbp.hpp @@ -65,12 
+65,12 @@ namespace lbp{ struct InSameComponint { public: - __device__ __forceinline__ InSameComponint(float _eps) : eps(_eps * 0.5) {} + __device__ __forceinline__ InSameComponint(float _eps) : eps(_eps) {} __device__ __forceinline__ InSameComponint(const InSameComponint& other) : eps(other.eps) {} __device__ __forceinline__ bool operator()(const int4& r1, const int4& r2) const { - double delta = eps * (min(r1.z, r2.z) + min(r1.w, r2.w)); + float delta = eps * (min(r1.z, r2.z) + min(r1.w, r2.w)) * 0.5; return abs(r1.x - r2.x) <= delta && abs(r1.y - r2.y) <= delta && abs(r1.x + r1.z - r2.x - r2.z) <= delta && abs(r1.y + r1.w - r2.y - r2.w) <= delta; diff --git a/modules/gpu/test/test_objdetect.cpp b/modules/gpu/test/test_objdetect.cpp index 9c6db09dc6..8b49538d40 100644 --- a/modules/gpu/test/test_objdetect.cpp +++ b/modules/gpu/test/test_objdetect.cpp @@ -308,4 +308,57 @@ INSTANTIATE_TEST_CASE_P(GPU_ObjDetect, LBP_Read_classifier, testing::Combine( testing::Values(0) )); +PARAM_TEST_CASE(LBP_classify, cv::gpu::DeviceInfo, int) +{ + cv::gpu::DeviceInfo devInfo; + + virtual void SetUp() + { + devInfo = GET_PARAM(0); + cv::gpu::setDevice(devInfo.deviceID()); + } +}; + +TEST_P(LBP_classify, Accuracy) +{ + std::string classifierXmlPath = std::string(cvtest::TS::ptr()->get_data_path()) + "lbpcascade/lbpcascade_frontalface.xml"; + std::string imagePath = std::string(cvtest::TS::ptr()->get_data_path()) + "lbpcascade/er.png"; + + cv::CascadeClassifier cpuClassifier(classifierXmlPath); + ASSERT_FALSE(cpuClassifier.empty()); + + cv::Mat image = cv::imread(imagePath); + image = image.colRange(0, image.cols / 2); + cv::Mat grey; + cvtColor(image, grey, CV_BGR2GRAY); + ASSERT_FALSE(image.empty()); + + std::vector rects; + cpuClassifier.detectMultiScale(grey, rects); + cv::Mat markedImage = image.clone(); + + std::vector::iterator it = rects.begin(); + for (; it != rects.end(); ++it) + cv::rectangle(markedImage, *it, cv::Scalar(255, 0, 0, 255)); + 
cv::gpu::CascadeClassifier_GPU_LBP gpuClassifier; + ASSERT_TRUE(gpuClassifier.load(classifierXmlPath)); + cv::gpu::GpuMat gpu_rects, buffer; + cv::gpu::GpuMat tested(grey); + int count = gpuClassifier.detectMultiScale(tested, buffer, gpu_rects); + + cv::Mat gpu_f(gpu_rects); + int* gpu_faces = (int*)gpu_f.ptr(); + for (int i = 0; i < count; i++) + { + cv::Rect r(gpu_faces[i * 4],gpu_faces[i * 4 + 1],gpu_faces[i * 4 + 2],gpu_faces[i * 4 + 3]); + cv::rectangle(markedImage, r , cv::Scalar(0, 0, 255, 255)); + } +} + +INSTANTIATE_TEST_CASE_P(GPU_ObjDetect, LBP_classify, testing::Combine( + ALL_DEVICES, + testing::Values(0) + )); + } // namespace