diff --git a/modules/gpu/src/cascadeclassifier.cpp b/modules/gpu/src/cascadeclassifier.cpp index d340593ddc..594a5f727b 100644 --- a/modules/gpu/src/cascadeclassifier.cpp +++ b/modules/gpu/src/cascadeclassifier.cpp @@ -371,8 +371,9 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp { int acc = level.sFrame.width + 1; float iniScale = level.scale; + cv::Size area = level.workArea; - float step = (float)(1 + (level.scale <= 2.f)); + int step = 1 + (level.scale <= 2.f); int total = 0, prev = 0; @@ -387,19 +388,22 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp gpu::resize(image, src, level.sFrame, 0, 0, CV_INTER_LINEAR); gpu::integralBuffered(src, sint, buff); - total += cvCeil(area.width / step) * cvCeil(area.height / step); - // std::cout << "Total for scale: " << total << " this step contribution " << cvCeil(area.width / step) * cvCeil(area.height / step) << " previous width shift " << prev << " acc " << acc << " scales: " << cvCeil(area.width / step) << std::endl; + // calculate job + int totalWidth = level.workArea.width / step; + // totalWidth = ((totalWidth + WARP_MASK) / WARP_SIZE) << WARP_LOG; - // increment pyr lavel + total += totalWidth * (level.workArea.height / step); + + // go to next pyramid level + level = level.next(scaleFactor, image.size(), NxM); area = level.workArea; - step = (float)(1 + (level.scale <= 2.f)); + step = (1 + (level.scale <= 2.f)); prev = acc; acc += level.sFrame.width + 1; } - device::lbp::classifyPyramid(image.cols, image.rows, NxM.width, NxM.height, iniScale, scaleFactor, total, stage_mat, stage_mat.cols / sizeof(Stage), nodes_mat, + device::lbp::classifyPyramid(image.cols, image.rows, NxM.width - 1, NxM.height - 1, iniScale, scaleFactor, total, stage_mat, stage_mat.cols / sizeof(Stage), nodes_mat, leaves_mat, subsets_mat, features_mat, subsetSize, candidates, dclassified.ptr(), integral); } @@ -412,8 +416,6 @@ int 
cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp // candidates.copyTo(objects); cudaSafeCall( cudaMemcpy(&classified, dclassified.ptr(), sizeof(int), cudaMemcpyDeviceToHost) ); cudaSafeCall( cudaDeviceSynchronize() ); - // std::cout << classified << " !!!!!!!!!!" << std::endl; - return classified; } diff --git a/modules/gpu/src/cuda/lbp.cu b/modules/gpu/src/cuda/lbp.cu index bb65e261ad..bbbe0bf449 100644 --- a/modules/gpu/src/cuda/lbp.cu +++ b/modules/gpu/src/cuda/lbp.cu @@ -240,59 +240,47 @@ namespace cv { namespace gpu { namespace device // stepShift, scale, width_k, sum_prev => y = sum_prev + tid_k / width_k, x = tid_k - tid_k / width_k __global__ void lbp_cascade(const Cascade cascade, int frameW, int frameH, int windowW, int windowH, float scale, const float factor, - const int workAmount, int* integral, const int pitch, DevMem2D_ objects, unsigned int* classified) + const int total, int* integral, const int pitch, DevMem2D_ objects, unsigned int* classified) { int ftid = blockIdx.x * blockDim.x + threadIdx.x; - if (ftid >= workAmount ) return; + if (ftid >= total) return; - int sum = 0; - // float scale = 1.0f; - float stepShift = (scale <= 2.f) ? 2.0 : 1.0; - int w = ceilf( ( __float2int_rn(frameW / scale) - windowW + 1) / stepShift); - int h = ceilf( ( __float2int_rn(frameH / scale) - windowH + 1) / stepShift); + int step = (scale <= 2.f); - // if (!ftid) - // printf("!!!!: %d %d", w, h); + int windowsForLine = (__float2int_rn( __fdividef(frameW, scale)) - windowW) >> step; + int stotal = windowsForLine * ( (__float2int_rn( __fdividef(frameH, scale)) - windowH) >> step); + int wshift = 0; - int framTid = ftid; - int i = 0; + int scaleTid = ftid; - while (1) + while (scaleTid >= stotal) { - if (framTid < (w - 1) * (h - 1)) break; - i++; - sum += __float2int_rn(frameW / scale) + 1; - framTid -= w * h; + scaleTid -= stotal; + wshift += __float2int_rn(__fdividef(frameW, scale)) + 1; scale *= factor; - stepShift = (scale <= 2.f) ? 
2.0 : 1.0; - int w = ceilf( ( __float2int_rn(frameW / scale) - windowW + 1) / stepShift); - int h = ceilf( ( __float2int_rn(frameH / scale) - windowH + 1) / stepShift); + step = (scale <= 2.f); + windowsForLine = ( ((__float2int_rn(__fdividef(frameW, scale)) - windowW) >> step)); + stotal = windowsForLine * ( (__float2int_rn(__fdividef(frameH, scale)) - windowH) >> step); } - int y = (framTid / w); - int x = (framTid - y * w) * stepShift; - y *= stepShift; - x += sum; + int y = __fdividef(scaleTid, windowsForLine); + int x = scaleTid - y * windowsForLine; - // if (i == 2) - // printf("!!!!!!!!!!!!!! %f %d %d %d\n", windowW * scale, sum, y, x); + x <<= step; + y <<= step; - if (cascade(y, x, integral, pitch)) + if (cascade(y, x + wshift, integral, pitch)) { + if(x >= __float2int_rn(__fdividef(frameW, scale)) - windowW) return; + int4 rect; - rect.x = roundf( (x - sum) * scale); - rect.y = roundf(y * scale); - rect.z = roundf(windowW * scale); - rect.w = roundf(windowH * scale); - - if (rect.x > frameW || rect.y > frameH) return; - // printf("OUTLAUER %d %d %d %d %d %d %d %d %d %f %f\n", x, y, ftid, framTid, rect.x, rect.y, sum, w, h, stepShift, scale); - - // printf("passed: %d %d ---- %d %d %d %d %d\n", y, x, rect.x, rect.y, rect.z, rect.w, sum); + rect.x = __float2int_rn(x * scale); + rect.y = __float2int_rn(y * scale); + rect.z = __float2int_rn(windowW * scale); + rect.w = __float2int_rn(windowH * scale); int res = Emulation::smem::atomicInc(classified, (unsigned int)objects.cols); objects(0, res) = rect; - } }