From 69a0b5dde596840f8face6a4623a9fcffacf5fda Mon Sep 17 00:00:00 2001 From: yao Date: Mon, 15 Apr 2013 17:24:30 +0800 Subject: [PATCH] Add OclCascadeClassifierBuf interface --- modules/ocl/include/opencv2/ocl/ocl.hpp | 38 + modules/ocl/src/haar.cpp | 719 ++++++++++++++---- modules/ocl/src/opencl/haarobjectdetect.cl | 316 ++++---- .../src/opencl/haarobjectdetect_scaled2.cl | 294 ++++--- modules/ocl/test/test_haar.cpp | 102 ++- 5 files changed, 963 insertions(+), 506 deletions(-) diff --git a/modules/ocl/include/opencv2/ocl/ocl.hpp b/modules/ocl/include/opencv2/ocl/ocl.hpp index 613179f8b6..d00f4bf54e 100644 --- a/modules/ocl/include/opencv2/ocl/ocl.hpp +++ b/modules/ocl/include/opencv2/ocl/ocl.hpp @@ -802,6 +802,44 @@ namespace cv int minNeighbors, int flags, CvSize minSize = cvSize(0, 0), CvSize maxSize = cvSize(0, 0)); }; + class CV_EXPORTS OclCascadeClassifierBuf : public cv::CascadeClassifier + { + public: + OclCascadeClassifierBuf() : + m_flags(0), initialized(false), m_scaleFactor(0), buffers(NULL) {} + + ~OclCascadeClassifierBuf() {} + + void detectMultiScale(oclMat &image, CV_OUT std::vector& faces, + double scaleFactor = 1.1, int minNeighbors = 3, int flags = 0, + Size minSize = Size(), Size maxSize = Size()); + void release(); + + private: + void Init(const int rows, const int cols, double scaleFactor, int flags, + const int outputsz, const size_t localThreads[], + CvSize minSize, CvSize maxSize); + void CreateBaseBufs(const int datasize, const int totalclassifier, const int flags, const int outputsz); + void CreateFactorRelatedBufs(const int rows, const int cols, const int flags, + const double scaleFactor, const size_t localThreads[], + CvSize minSize, CvSize maxSize); + void GenResult(CV_OUT std::vector& faces, const std::vector &rectList, const std::vector &rweights); + + int m_rows; + int m_cols; + int m_flags; + int m_loopcount; + int m_nodenum; + bool findBiggestObject; + bool initialized; + double m_scaleFactor; + Size m_minSize; + Size m_maxSize; + vector sizev; + vector scalev; + oclMat gimg1, gsum, gsqsum; + void * buffers; + }; /////////////////////////////// Pyramid ///////////////////////////////////// diff --git a/modules/ocl/src/haar.cpp b/modules/ocl/src/haar.cpp index 1c727f01f9..c65f68b050 100644 --- a/modules/ocl/src/haar.cpp +++ b/modules/ocl/src/haar.cpp @@ -20,6 +20,7 @@ // Jia Haipeng, jiahaipeng95@gmail.com // Wu Xinglong, wxl370@126.com // Wang Yao, bitwangyaoyao@gmail.com +// Sen Liu, swjtuls1987@126.com // // Redistribution and use in source and binary forms, with or without modification, // are permitted provided that the following conditions are met: @@ -842,15 +843,13 @@ static void gpuSetHaarClassifierCascade( CvHaarClassifierCascade *_cascade } /* j */ } } + CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemStorage *storage, double scaleFactor, int minNeighbors, int flags, CvSize minSize, CvSize maxSize) { CvHaarClassifierCascade *cascade = oldCascade; - //double alltime = (double)cvGetTickCount(); - //double t = (double)cvGetTickCount(); const double GROUP_EPS = 0.2; - oclMat gtemp, gsum1, gtilted1, gsqsum1, gnormImg, gsumcanny; CvSeq *result_seq = 0; cv::Ptr temp_storage; @@ -861,7 +860,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS int datasize=0; int totalclassifier=0; - //void *out; GpuHidHaarClassifierCascade *gcascade; GpuHidHaarStageClassifier *stage; GpuHidHaarClassifier *classifier; @@ -870,11 +868,8 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS int *candidate; cl_int status; - // bool doCannyPruning = (flags & CV_HAAR_DO_CANNY_PRUNING) != 0; bool findBiggestObject = (flags & CV_HAAR_FIND_BIGGEST_OBJECT) != 0; - // bool roughSearch = (flags & CV_HAAR_DO_ROUGH_SEARCH) != 0; - //double t = 0; if( maxSize.height == 0 || maxSize.width == 0 ) { maxSize.height = gimg.rows; @@ -896,27 +891,20 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS if( findBiggestObject ) flags &= ~CV_HAAR_SCALE_IMAGE; - //gtemp = oclMat( gimg.rows, gimg.cols, CV_8UC1); - //gsum1 = oclMat( gimg.rows + 1, gimg.cols + 1, CV_32SC1 ); - //gsqsum1 = oclMat( gimg.rows + 1, gimg.cols + 1, CV_32FC1 ); - if( !cascade->hid_cascade ) - /*out = (void *)*/gpuCreateHidHaarClassifierCascade(cascade, &datasize, &totalclassifier); - if( cascade->hid_cascade->has_tilted_features ) - gtilted1 = oclMat( gimg.rows + 1, gimg.cols + 1, CV_32SC1 ); + gpuCreateHidHaarClassifierCascade(cascade, &datasize, &totalclassifier); result_seq = cvCreateSeq( 0, sizeof(CvSeq), sizeof(CvAvgComp), storage ); if( CV_MAT_CN(gimg.type()) > 1 ) { + oclMat gtemp; cvtColor( gimg, gtemp, CV_BGR2GRAY ); gimg = gtemp; } if( findBiggestObject ) flags &= ~(CV_HAAR_SCALE_IMAGE | CV_HAAR_DO_CANNY_PRUNING); - //t = (double)cvGetTickCount() - t; - //printf( "before if time = %g ms\n", t/((double)cvGetTickFrequency()*1000.) ); if( gimg.cols < minSize.width || gimg.rows < minSize.height ) CV_Error(CV_StsError, "Image too small"); @@ -924,12 +912,9 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS if( (flags & CV_HAAR_SCALE_IMAGE) ) { CvSize winSize0 = cascade->orig_window_size; - //float scalefactor = 1.1f; - //float factor = 1.f; int totalheight = 0; int indexy = 0; CvSize sz; - //t = (double)cvGetTickCount(); vector sizev; vector scalev; for(factor = 1.f;; factor *= scaleFactor) @@ -950,20 +935,15 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS sizev.push_back(sz); scalev.push_back(factor); } - //int flag = 0; oclMat gimg1(gimg.rows, gimg.cols, CV_8UC1); oclMat gsum(totalheight + 4, gimg.cols + 1, CV_32SC1); oclMat gsqsum(totalheight + 4, gimg.cols + 1, CV_32FC1); - //cl_mem cascadebuffer; cl_mem stagebuffer; - //cl_mem classifierbuffer; cl_mem nodebuffer; cl_mem candidatebuffer; cl_mem scaleinfobuffer; - //cl_kernel kernel; - //kernel = openCLGetKernelFromSource(gimg.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascade"); cv::Rect roi, roi2; cv::Mat imgroi, imgroisq; cv::ocl::oclMat resizeroi, gimgroi, gimgroisq; @@ -971,18 +951,13 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS size_t blocksize = 8; size_t localThreads[3] = { blocksize, blocksize , 1 }; - size_t globalThreads[3] = { grp_per_CU *((gsum.clCxt)->computeUnits()) *localThreads[0], + size_t globalThreads[3] = { grp_per_CU * gsum.clCxt->computeUnits() *localThreads[0], localThreads[1], 1 }; int outputsz = 256 * globalThreads[0] / localThreads[0]; int loopcount = sizev.size(); detect_piramid_info *scaleinfo = (detect_piramid_info *)malloc(sizeof(detect_piramid_info) * loopcount); - //t = (double)cvGetTickCount() - t; - // printf( "pre time = %g ms\n", t/((double)cvGetTickFrequency()*1000.) ); - //int *it =scaleinfo; - // t = (double)cvGetTickCount(); - for( int i = 0; i < loopcount; i++ ) { sz = sizev[i]; @@ -992,7 +967,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS resizeroi = gimg1(roi2); gimgroi = gsum(roi); gimgroisq = gsqsum(roi); - //scaleinfo[i].rows = gimgroi.rows; int width = gimgroi.cols - 1 - cascade->orig_window_size.width; int height = gimgroi.rows - 1 - cascade->orig_window_size.height; scaleinfo[i].width_height = (width << 16) | height; @@ -1000,76 +974,40 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS int grpnumperline = (width + localThreads[0] - 1) / localThreads[0]; int totalgrp = ((height + localThreads[1] - 1) / localThreads[1]) * grpnumperline; - //outputsz +=width*height; scaleinfo[i].grpnumperline_totalgrp = (grpnumperline << 16) | totalgrp; scaleinfo[i].imgoff = gimgroi.offset >> 2; scaleinfo[i].factor = factor; - //printf("rows = %d,ystep = %d,width = %d,height = %d,grpnumperline = %d,totalgrp = %d,imgoff = %d,factor = %f\n", - // scaleinfo[i].rows,scaleinfo[i].ystep,scaleinfo[i].width,scaleinfo[i].height,scaleinfo[i].grpnumperline, - // scaleinfo[i].totalgrp,scaleinfo[i].imgoff,scaleinfo[i].factor); cv::ocl::resize(gimg, resizeroi, Size(sz.width - 1, sz.height - 1), 0, 0, INTER_LINEAR); - //cv::imwrite("D:\\1.jpg",gimg1); cv::ocl::integral(resizeroi, gimgroi, gimgroisq); - //cv::ocl::oclMat chk(sz.height,sz.width,CV_32SC1),chksq(sz.height,sz.width,CV_32FC1); - //cv::ocl::integral(gimg1, chk, chksq); - //double r = cv::norm(chk,gimgroi,NORM_INF); - //if(r > std::numeric_limits::epsilon()) - //{ - // printf("failed"); - //} indexy += sz.height; } - //int ystep = factor > 2 ? 1 : 2; - // t = (double)cvGetTickCount() - t; - //printf( "resize integral time = %g ms\n", t/((double)cvGetTickFrequency()*1000.) ); - //t = (double)cvGetTickCount(); + gcascade = (GpuHidHaarClassifierCascade *)cascade->hid_cascade; stage = (GpuHidHaarStageClassifier *)(gcascade + 1); classifier = (GpuHidHaarClassifier *)(stage + gcascade->count); node = (GpuHidHaarTreeNode *)(classifier->node); - //int m,n; - //m = (gsum.cols - 1 - cascade->orig_window_size.width + ystep - 1)/ystep; - //n = (gsum.rows - 1 - cascade->orig_window_size.height + ystep - 1)/ystep; - //int counter = m*n; - int nodenum = (datasize - sizeof(GpuHidHaarClassifierCascade) - sizeof(GpuHidHaarStageClassifier) * gcascade->count - sizeof(GpuHidHaarClassifier) * totalclassifier) / sizeof(GpuHidHaarTreeNode); - //if(flag == 0){ - candidate = (int *)malloc(4 * sizeof(int) * outputsz); - //memset((char*)candidate,0,4*sizeof(int)*outputsz); - gpuSetImagesForHaarClassifierCascade( cascade,/* &sum1, &sqsum1, _tilted,*/ 1., gsum.step / 4 ); - //cascadebuffer = clCreateBuffer(gsum.clCxt->clContext,CL_MEM_READ_ONLY,sizeof(GpuHidHaarClassifierCascade),NULL,&status); - //openCLVerifyCall(status); - //openCLSafeCall(clEnqueueWriteBuffer(gsum.clCxt->clCmdQueue,cascadebuffer,1,0,sizeof(GpuHidHaarClassifierCascade),gcascade,0,NULL,NULL)); + candidate = (int *)malloc(4 * sizeof(int) * outputsz); + + gpuSetImagesForHaarClassifierCascade( cascade, 1., gsum.step / 4 ); stagebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(GpuHidHaarStageClassifier) * gcascade->count); - //openCLVerifyCall(status); - openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)gsum.clCxt->oclCommandQueue(), stagebuffer, 1, 0, sizeof(GpuHidHaarStageClassifier)*gcascade->count, stage, 0, NULL, NULL)); - - //classifierbuffer = clCreateBuffer(gsum.clCxt->clContext,CL_MEM_READ_ONLY,sizeof(GpuHidHaarClassifier)*totalclassifier,NULL,&status); - //status = clEnqueueWriteBuffer(gsum.clCxt->clCmdQueue,classifierbuffer,1,0,sizeof(GpuHidHaarClassifier)*totalclassifier,classifier,0,NULL,NULL); + cl_command_queue qu = (cl_command_queue)gsum.clCxt->oclCommandQueue(); + openCLSafeCall(clEnqueueWriteBuffer(qu, stagebuffer, 1, 0, sizeof(GpuHidHaarStageClassifier)*gcascade->count, stage, 0, NULL, NULL)); nodebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, nodenum * sizeof(GpuHidHaarTreeNode)); - //openCLVerifyCall(status); - openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)gsum.clCxt->oclCommandQueue(), nodebuffer, 1, 0, - nodenum * sizeof(GpuHidHaarTreeNode), + + openCLSafeCall(clEnqueueWriteBuffer(qu, nodebuffer, 1, 0, nodenum * sizeof(GpuHidHaarTreeNode), node, 0, NULL, NULL)); candidatebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_WRITE_ONLY, 4 * sizeof(int) * outputsz); - //openCLVerifyCall(status); + scaleinfobuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(detect_piramid_info) * loopcount); - //openCLVerifyCall(status); - openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)gsum.clCxt->oclCommandQueue(), scaleinfobuffer, 1, 0, sizeof(detect_piramid_info)*loopcount, scaleinfo, 0, NULL, NULL)); - //flag = 1; - //} + openCLSafeCall(clEnqueueWriteBuffer(qu, scaleinfobuffer, 1, 0, sizeof(detect_piramid_info)*loopcount, scaleinfo, 0, NULL, NULL)); - //t = (double)cvGetTickCount() - t; - //printf( "update time = %g ms\n", t/((double)cvGetTickFrequency()*1000.) ); - - //size_t globalThreads[3] = { counter+blocksize*blocksize-counter%(blocksize*blocksize),1,1}; - //t = (double)cvGetTickCount(); int startstage = 0; int endstage = gcascade->count; int startnode = 0; @@ -1087,11 +1025,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS pq.s[3] = gcascade->pq3; float correction = gcascade->inv_window_area; - //int grpnumperline = ((m + localThreads[0] - 1) / localThreads[0]); - //int totalgrp = ((n + localThreads[1] - 1) / localThreads[1])*grpnumperline; - // openCLVerifyKernel(gsum.clCxt, kernel, &blocksize, globalThreads, localThreads); - //openCLSafeCall(clSetKernelArg(kernel,argcount++,sizeof(cl_mem),(void*)&cascadebuffer)); - vector > args; args.push_back ( make_pair(sizeof(cl_mem) , (void *)&stagebuffer )); args.push_back ( make_pair(sizeof(cl_mem) , (void *)&scaleinfobuffer )); @@ -1111,28 +1044,20 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS args.push_back ( make_pair(sizeof(cl_float) , (void *)&correction )); openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascade", globalThreads, localThreads, args, -1, -1); - //t = (double)cvGetTickCount() - t; - //printf( "detection time = %g ms\n", t/((double)cvGetTickFrequency()*1000.) ); - //t = (double)cvGetTickCount(); - //openCLSafeCall(clEnqueueReadBuffer(gsum.clCxt->impl->clCmdQueue, candidatebuffer, 1, 0, 4 * sizeof(int)*outputsz, candidate, 0, NULL, NULL)); + openCLReadBuffer( gsum.clCxt, candidatebuffer, candidate, 4 * sizeof(int)*outputsz ); for(int i = 0; i < outputsz; i++) if(candidate[4 * i + 2] != 0) - allCandidates.push_back(Rect(candidate[4 * i], candidate[4 * i + 1], candidate[4 * i + 2], candidate[4 * i + 3])); - // t = (double)cvGetTickCount() - t; - //printf( "post time = %g ms\n", t/((double)cvGetTickFrequency()*1000.) ); - //t = (double)cvGetTickCount(); + allCandidates.push_back(Rect(candidate[4 * i], candidate[4 * i + 1], + candidate[4 * i + 2], candidate[4 * i + 3])); + free(scaleinfo); free(candidate); - //openCLSafeCall(clReleaseMemObject(cascadebuffer)); openCLSafeCall(clReleaseMemObject(stagebuffer)); openCLSafeCall(clReleaseMemObject(scaleinfobuffer)); openCLSafeCall(clReleaseMemObject(nodebuffer)); openCLSafeCall(clReleaseMemObject(candidatebuffer)); - // openCLSafeCall(clReleaseKernel(kernel)); - //t = (double)cvGetTickCount() - t; - //printf( "release time = %g ms\n", t/((double)cvGetTickFrequency()*1000.) ); } else { @@ -1150,7 +1075,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS classifier = (GpuHidHaarClassifier *)(stage + gcascade->count); node = (GpuHidHaarTreeNode *)(classifier->node); cl_mem stagebuffer; - //cl_mem classifierbuffer; cl_mem nodebuffer; cl_mem candidatebuffer; cl_mem scaleinfobuffer; @@ -1187,24 +1111,20 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS size_t blocksize = 8; size_t localThreads[3] = { blocksize, blocksize , 1 }; size_t globalThreads[3] = { grp_per_CU *gsum.clCxt->computeUnits() *localThreads[0], - localThreads[1], 1 - }; + localThreads[1], 1 }; int outputsz = 256 * globalThreads[0] / localThreads[0]; int nodenum = (datasize - sizeof(GpuHidHaarClassifierCascade) - sizeof(GpuHidHaarStageClassifier) * gcascade->count - sizeof(GpuHidHaarClassifier) * totalclassifier) / sizeof(GpuHidHaarTreeNode); nodebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, nodenum * sizeof(GpuHidHaarTreeNode)); - //openCLVerifyCall(status); - openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)gsum.clCxt->oclCommandQueue(), nodebuffer, 1, 0, + cl_command_queue qu = (cl_command_queue)gsum.clCxt->oclCommandQueue(); + openCLSafeCall(clEnqueueWriteBuffer(qu, nodebuffer, 1, 0, nodenum * sizeof(GpuHidHaarTreeNode), node, 0, NULL, NULL)); cl_mem newnodebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_WRITE, loopcount * nodenum * sizeof(GpuHidHaarTreeNode)); int startstage = 0; int endstage = gcascade->count; - //cl_kernel kernel; - //kernel = openCLGetKernelFromSource(gsum.clCxt, &haarobjectdetect_scaled2, "gpuRunHaarClassifierCascade_scaled2"); - //cl_kernel kernel2 = openCLGetKernelFromSource(gimg.clCxt, &haarobjectdetect_scaled2, "gpuscaleclassifier"); for(int i = 0; i < loopcount; i++) { sz = sizev[i]; @@ -1223,7 +1143,7 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS int height = (gsum.rows - 1 - sz.height + ystep - 1) / ystep; int grpnumperline = (width + localThreads[0] - 1) / localThreads[0]; int totalgrp = ((height + localThreads[1] - 1) / localThreads[1]) * grpnumperline; - //outputsz +=width*height; + scaleinfo[i].width_height = (width << 16) | height; scaleinfo[i].grpnumperline_totalgrp = (grpnumperline << 16) | totalgrp; scaleinfo[i].imgoff = 0; @@ -1241,28 +1161,20 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS size_t globalThreads2[3] = {nodenum, 1, 1}; openCLExecuteKernel(gsum.clCxt, &haarobjectdetect_scaled2, "gpuscaleclassifier", globalThreads2, NULL/*localThreads2*/, args1, -1, -1); - - //clEnqueueNDRangeKernel(gsum.clCxt->impl->clCmdQueue, kernel2, 1, NULL, globalThreads2, 0, 0, NULL, NULL); - //clFinish(gsum.clCxt->impl->clCmdQueue); } - //clReleaseKernel(kernel2); + int step = gsum.step / 4; int startnode = 0; int splitstage = 3; - int splitnode = stage[0].count + stage[1].count + stage[2].count; stagebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(GpuHidHaarStageClassifier) * gcascade->count); - //openCLVerifyCall(status); - openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)gsum.clCxt->oclCommandQueue(), stagebuffer, 1, 0, sizeof(GpuHidHaarStageClassifier)*gcascade->count, stage, 0, NULL, NULL)); + openCLSafeCall(clEnqueueWriteBuffer(qu, stagebuffer, 1, 0, sizeof(GpuHidHaarStageClassifier)*gcascade->count, stage, 0, NULL, NULL)); candidatebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, 4 * sizeof(int) * outputsz); - //openCLVerifyCall(status); scaleinfobuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(detect_piramid_info) * loopcount); - //openCLVerifyCall(status); - openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)gsum.clCxt->oclCommandQueue(), scaleinfobuffer, 1, 0, sizeof(detect_piramid_info)*loopcount, scaleinfo, 0, NULL, NULL)); + openCLSafeCall(clEnqueueWriteBuffer(qu, scaleinfobuffer, 1, 0, sizeof(detect_piramid_info)*loopcount, scaleinfo, 0, NULL, NULL)); pbuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(cl_int4) * loopcount); - openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)gsum.clCxt->oclCommandQueue(), pbuffer, 1, 0, sizeof(cl_int4)*loopcount, p, 0, NULL, NULL)); + openCLSafeCall(clEnqueueWriteBuffer(qu, pbuffer, 1, 0, sizeof(cl_int4)*loopcount, p, 0, NULL, NULL)); correctionbuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(cl_float) * loopcount); - openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)gsum.clCxt->oclCommandQueue(), correctionbuffer, 1, 0, sizeof(cl_float)*loopcount, correction, 0, NULL, NULL)); - //int argcount = 0; + openCLSafeCall(clEnqueueWriteBuffer(qu, correctionbuffer, 1, 0, sizeof(cl_float)*loopcount, correction, 0, NULL, NULL)); vector > args; args.push_back ( make_pair(sizeof(cl_mem) , (void *)&stagebuffer )); @@ -1271,22 +1183,21 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS args.push_back ( make_pair(sizeof(cl_mem) , (void *)&gsum.data )); args.push_back ( make_pair(sizeof(cl_mem) , (void *)&gsqsum.data )); args.push_back ( make_pair(sizeof(cl_mem) , (void *)&candidatebuffer )); + args.push_back ( make_pair(sizeof(cl_int) , (void *)&gsum.rows )); + args.push_back ( make_pair(sizeof(cl_int) , (void *)&gsum.cols )); args.push_back ( make_pair(sizeof(cl_int) , (void *)&step )); args.push_back ( make_pair(sizeof(cl_int) , (void *)&loopcount )); args.push_back ( make_pair(sizeof(cl_int) , (void *)&startstage )); args.push_back ( make_pair(sizeof(cl_int) , (void *)&splitstage )); args.push_back ( make_pair(sizeof(cl_int) , (void *)&endstage )); args.push_back ( make_pair(sizeof(cl_int) , (void *)&startnode )); - args.push_back ( make_pair(sizeof(cl_int) , (void *)&splitnode )); args.push_back ( make_pair(sizeof(cl_mem) , (void *)&pbuffer )); args.push_back ( make_pair(sizeof(cl_mem) , (void *)&correctionbuffer )); args.push_back ( make_pair(sizeof(cl_int) , (void *)&nodenum )); - openCLExecuteKernel(gsum.clCxt, &haarobjectdetect_scaled2, "gpuRunHaarClassifierCascade_scaled2", globalThreads, localThreads, args, -1, -1); - //openCLSafeCall(clEnqueueReadBuffer(gsum.clCxt->clCmdQueue,candidatebuffer,1,0,4*sizeof(int)*outputsz,candidate,0,NULL,NULL)); - candidate = (int *)clEnqueueMapBuffer((cl_command_queue)gsum.clCxt->oclCommandQueue(), candidatebuffer, 1, CL_MAP_READ, 0, 4 * sizeof(int), 0, 0, 0, &status); + candidate = (int *)clEnqueueMapBuffer(qu, candidatebuffer, 1, CL_MAP_READ, 0, 4 * sizeof(int) * outputsz, 0, 0, 0, &status); for(int i = 0; i < outputsz; i++) { @@ -1297,7 +1208,7 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS free(scaleinfo); free(p); free(correction); - clEnqueueUnmapMemObject((cl_command_queue)gsum.clCxt->oclCommandQueue(), candidatebuffer, candidate, 0, 0, 0); + clEnqueueUnmapMemObject(qu, candidatebuffer, candidate, 0, 0, 0); openCLSafeCall(clReleaseMemObject(stagebuffer)); openCLSafeCall(clReleaseMemObject(scaleinfobuffer)); openCLSafeCall(clReleaseMemObject(nodebuffer)); @@ -1306,21 +1217,17 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS openCLSafeCall(clReleaseMemObject(pbuffer)); openCLSafeCall(clReleaseMemObject(correctionbuffer)); } - //t = (double)cvGetTickCount() ; + cvFree(&cascade->hid_cascade); - // printf("%d\n",globalcounter); rectList.resize(allCandidates.size()); if(!allCandidates.empty()) std::copy(allCandidates.begin(), allCandidates.end(), rectList.begin()); - //cout << "count = " << rectList.size()<< endl; - if( minNeighbors != 0 || findBiggestObject ) groupRectangles(rectList, rweights, std::max(minNeighbors, 1), GROUP_EPS); else rweights.resize(rectList.size(), 0); - if( findBiggestObject && rectList.size() ) { CvAvgComp result_comp = {{0, 0, 0, 0}, 0}; @@ -1346,13 +1253,565 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS cvSeqPush( result_seq, &c ); } } - //t = (double)cvGetTickCount() - t; - //printf( "get face time = %g ms\n", t/((double)cvGetTickFrequency()*1000.) ); - //alltime = (double)cvGetTickCount() - alltime; - //printf( "all time = %g ms\n", alltime/((double)cvGetTickFrequency()*1000.) ); + return result_seq; } +struct OclBuffers +{ + cl_mem stagebuffer; + cl_mem nodebuffer; + cl_mem candidatebuffer; + cl_mem scaleinfobuffer; + cl_mem pbuffer; + cl_mem correctionbuffer; + cl_mem newnodebuffer; +}; + +struct getRect +{ + Rect operator()(const CvAvgComp &e) const + { + return e.rect; + } +}; + +void cv::ocl::OclCascadeClassifierBuf::detectMultiScale(oclMat &gimg, CV_OUT std::vector& faces, + double scaleFactor, int minNeighbors, int flags, + Size minSize, Size maxSize) +{ + int blocksize = 8; + int grp_per_CU = 12; + size_t localThreads[3] = { blocksize, blocksize, 1 }; + size_t globalThreads[3] = { grp_per_CU * Context::getContext()->computeUnits() * localThreads[0], + localThreads[1], + 1 }; + int outputsz = 256 * globalThreads[0] / localThreads[0]; + + Init(gimg.rows, gimg.cols, scaleFactor, flags, outputsz, localThreads, minSize, maxSize); + + const double GROUP_EPS = 0.2; + + cv::ConcurrentRectVector allCandidates; + std::vector rectList; + std::vector rweights; + + CvHaarClassifierCascade *cascade = oldCascade; + GpuHidHaarClassifierCascade *gcascade; + GpuHidHaarStageClassifier *stage; + GpuHidHaarClassifier *classifier; + GpuHidHaarTreeNode *node; + + if( CV_MAT_DEPTH(gimg.type()) != CV_8U ) + CV_Error( CV_StsUnsupportedFormat, "Only 8-bit images are supported" ); + + if( CV_MAT_CN(gimg.type()) > 1 ) + { + oclMat gtemp; + cvtColor( gimg, gtemp, CV_BGR2GRAY ); + gimg = gtemp; + } + + int *candidate; + + if( (flags & CV_HAAR_SCALE_IMAGE) ) + { + int indexy = 0; + CvSize sz; + + cv::Rect roi, roi2; + cv::Mat imgroi, imgroisq; + cv::ocl::oclMat resizeroi, gimgroi, gimgroisq; + + for( int i = 0; i < m_loopcount; i++ ) + { + sz = sizev[i]; + roi = Rect(0, indexy, sz.width, sz.height); + roi2 = Rect(0, 0, sz.width - 1, sz.height - 1); + resizeroi = gimg1(roi2); + gimgroi = gsum(roi); + gimgroisq = gsqsum(roi); + + cv::ocl::resize(gimg, resizeroi, Size(sz.width - 1, sz.height - 1), 0, 0, INTER_LINEAR); + cv::ocl::integral(resizeroi, gimgroi, gimgroisq); + indexy += sz.height; + } + + gcascade = (GpuHidHaarClassifierCascade *)(cascade->hid_cascade); + stage = (GpuHidHaarStageClassifier *)(gcascade + 1); + classifier = (GpuHidHaarClassifier *)(stage + gcascade->count); + node = (GpuHidHaarTreeNode *)(classifier->node); + + gpuSetImagesForHaarClassifierCascade( cascade, 1., gsum.step / 4 ); + + cl_command_queue qu = (cl_command_queue)gsum.clCxt->oclCommandQueue(); + openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->stagebuffer, 1, 0, + sizeof(GpuHidHaarStageClassifier) * gcascade->count, + stage, 0, NULL, NULL)); + + openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->nodebuffer, 1, 0, + m_nodenum * sizeof(GpuHidHaarTreeNode), + node, 0, NULL, NULL)); + + int startstage = 0; + int endstage = gcascade->count; + int startnode = 0; + int pixelstep = gsum.step / 4; + int splitstage = 3; + int splitnode = stage[0].count + stage[1].count + stage[2].count; + cl_int4 p, pq; + p.s[0] = gcascade->p0; + p.s[1] = gcascade->p1; + p.s[2] = gcascade->p2; + p.s[3] = gcascade->p3; + pq.s[0] = gcascade->pq0; + pq.s[1] = gcascade->pq1; + pq.s[2] = gcascade->pq2; + pq.s[3] = gcascade->pq3; + float correction = gcascade->inv_window_area; + + vector > args; + args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->stagebuffer )); + args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->scaleinfobuffer )); + args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->nodebuffer )); + args.push_back ( make_pair(sizeof(cl_mem) , (void *)&gsum.data )); + args.push_back ( make_pair(sizeof(cl_mem) , (void *)&gsqsum.data )); + args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->candidatebuffer )); + args.push_back ( make_pair(sizeof(cl_int) , (void *)&pixelstep )); + args.push_back ( make_pair(sizeof(cl_int) , (void *)&m_loopcount )); + args.push_back ( make_pair(sizeof(cl_int) , (void *)&startstage )); + args.push_back ( make_pair(sizeof(cl_int) , (void *)&splitstage )); + args.push_back ( make_pair(sizeof(cl_int) , (void *)&endstage )); + args.push_back ( make_pair(sizeof(cl_int) , (void *)&startnode )); + args.push_back ( make_pair(sizeof(cl_int) , (void *)&splitnode )); + args.push_back ( make_pair(sizeof(cl_int4) , (void *)&p )); + args.push_back ( make_pair(sizeof(cl_int4) , (void *)&pq )); + args.push_back ( make_pair(sizeof(cl_float) , (void *)&correction )); + + openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascade", globalThreads, localThreads, args, -1, -1); + + candidate = (int *)malloc(4 * sizeof(int) * outputsz); + memset(candidate, 0, 4 * sizeof(int) * outputsz); + openCLReadBuffer( gsum.clCxt, ((OclBuffers *)buffers)->candidatebuffer, candidate, 4 * sizeof(int)*outputsz ); + + for(int i = 0; i < outputsz; i++) + if(candidate[4 * i + 2] != 0) + allCandidates.push_back(Rect(candidate[4 * i], candidate[4 * i + 1], + candidate[4 * i + 2], candidate[4 * i + 3])); + + free((void *)candidate); + candidate = NULL; + } + else + { + cv::ocl::integral(gimg, gsum, gsqsum); + + gpuSetHaarClassifierCascade(cascade); + + gcascade = (GpuHidHaarClassifierCascade *)cascade->hid_cascade; + stage = (GpuHidHaarStageClassifier *)(gcascade + 1); + classifier = (GpuHidHaarClassifier *)(stage + gcascade->count); + node = (GpuHidHaarTreeNode *)(classifier->node); + + cl_command_queue qu = (cl_command_queue)gsum.clCxt->oclCommandQueue(); + openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->nodebuffer, 1, 0, + m_nodenum * sizeof(GpuHidHaarTreeNode), + node, 0, NULL, NULL)); + + cl_int4 *p = (cl_int4 *)malloc(sizeof(cl_int4) * m_loopcount); + float *correction = (float *)malloc(sizeof(float) * m_loopcount); + int startstage = 0; + int endstage = gcascade->count; + double factor; + for(int i = 0; i < m_loopcount; i++) + { + factor = scalev[i]; + int equRect_x = (int)(factor * gcascade->p0 + 0.5); + int equRect_y = (int)(factor * gcascade->p1 + 0.5); + int equRect_w = (int)(factor * gcascade->p3 + 0.5); + int equRect_h = (int)(factor * gcascade->p2 + 0.5); + p[i].s[0] = equRect_x; + p[i].s[1] = equRect_y; + p[i].s[2] = equRect_x + equRect_w; + p[i].s[3] = equRect_y + equRect_h; + correction[i] = 1. / (equRect_w * equRect_h); + int startnodenum = m_nodenum * i; + float factor2 = (float)factor; + + vector > args1; + args1.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->nodebuffer )); + args1.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->newnodebuffer )); + args1.push_back ( make_pair(sizeof(cl_float) , (void *)&factor2 )); + args1.push_back ( make_pair(sizeof(cl_float) , (void *)&correction[i] )); + args1.push_back ( make_pair(sizeof(cl_int) , (void *)&startnodenum )); + + size_t globalThreads2[3] = {m_nodenum, 1, 1}; + + openCLExecuteKernel(gsum.clCxt, &haarobjectdetect_scaled2, "gpuscaleclassifier", globalThreads2, NULL/*localThreads2*/, args1, -1, -1); + } + + int step = gsum.step / 4; + int startnode = 0; + int splitstage = 3; + openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->stagebuffer, 1, 0, sizeof(GpuHidHaarStageClassifier)*gcascade->count, stage, 0, NULL, NULL)); + openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->pbuffer, 1, 0, sizeof(cl_int4)*m_loopcount, p, 0, NULL, NULL)); + openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->correctionbuffer, 1, 0, sizeof(cl_float)*m_loopcount, correction, 0, NULL, NULL)); + + vector > args; + args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->stagebuffer )); + args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->scaleinfobuffer )); + args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->newnodebuffer )); + args.push_back ( make_pair(sizeof(cl_mem) , (void *)&gsum.data )); + args.push_back ( make_pair(sizeof(cl_mem) , (void *)&gsqsum.data )); + args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->candidatebuffer )); + args.push_back ( make_pair(sizeof(cl_int) , (void *)&gsum.rows )); + args.push_back ( make_pair(sizeof(cl_int) , (void *)&gsum.cols )); + args.push_back ( make_pair(sizeof(cl_int) , (void *)&step )); + args.push_back ( make_pair(sizeof(cl_int) , (void *)&m_loopcount )); + args.push_back ( make_pair(sizeof(cl_int) , (void *)&startstage )); + args.push_back ( make_pair(sizeof(cl_int) , (void *)&splitstage )); + args.push_back ( make_pair(sizeof(cl_int) , (void *)&endstage )); + args.push_back ( make_pair(sizeof(cl_int) , (void *)&startnode )); + args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->pbuffer )); + args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->correctionbuffer )); + args.push_back ( make_pair(sizeof(cl_int) , (void *)&m_nodenum )); + + openCLExecuteKernel(gsum.clCxt, &haarobjectdetect_scaled2, "gpuRunHaarClassifierCascade_scaled2", globalThreads, localThreads, args, -1, -1); + + candidate = (int *)clEnqueueMapBuffer(qu, ((OclBuffers *)buffers)->candidatebuffer, 1, CL_MAP_READ, 0, 4 * sizeof(int) * outputsz, 0, 0, 0, NULL); + + for(int i = 0; i < outputsz; i++) + { + if(candidate[4 * i + 2] != 0) + allCandidates.push_back(Rect(candidate[4 * i], candidate[4 * i + 1], + candidate[4 * i + 2], candidate[4 * i + 3])); + } + + free(p); + free(correction); + clEnqueueUnmapMemObject(qu, ((OclBuffers *)buffers)->candidatebuffer, candidate, 0, 0, 0); + } + + rectList.resize(allCandidates.size()); + if(!allCandidates.empty()) + std::copy(allCandidates.begin(), allCandidates.end(), rectList.begin()); + + if( minNeighbors != 0 || findBiggestObject ) + groupRectangles(rectList, rweights, std::max(minNeighbors, 1), GROUP_EPS); + else + rweights.resize(rectList.size(), 0); + + GenResult(faces, rectList, rweights); +} + +void cv::ocl::OclCascadeClassifierBuf::Init(const int rows, const int cols, + double scaleFactor, int flags, + const int outputsz, const size_t localThreads[], + CvSize minSize, CvSize maxSize) +{ + CvHaarClassifierCascade *cascade = oldCascade; + + if( !CV_IS_HAAR_CLASSIFIER(cascade) ) + CV_Error( !cascade ? CV_StsNullPtr : CV_StsBadArg, "Invalid classifier cascade" ); + + if( scaleFactor <= 1 ) + CV_Error( CV_StsOutOfRange, "scale factor must be > 1" ); + + if( cols < minSize.width || rows < minSize.height ) + CV_Error(CV_StsError, "Image too small"); + + int datasize=0; + int totalclassifier=0; + + if( !cascade->hid_cascade ) + gpuCreateHidHaarClassifierCascade(cascade, &datasize, &totalclassifier); + + if( maxSize.height == 0 || maxSize.width == 0 ) + { + maxSize.height = rows; + maxSize.width = cols; + } + + findBiggestObject = (flags & CV_HAAR_FIND_BIGGEST_OBJECT) != 0; + if( findBiggestObject ) + flags &= ~(CV_HAAR_SCALE_IMAGE | CV_HAAR_DO_CANNY_PRUNING); + + CreateBaseBufs(datasize, totalclassifier, flags, outputsz); + CreateFactorRelatedBufs(rows, cols, flags, scaleFactor, localThreads, minSize, maxSize); + + m_scaleFactor = scaleFactor; + m_rows = rows; + m_cols = cols; + m_flags = flags; + m_minSize = minSize; + m_maxSize = maxSize; + + initialized = true; +} + +void cv::ocl::OclCascadeClassifierBuf::CreateBaseBufs(const int datasize, const int totalclassifier, + const int flags, const int outputsz) +{ + if (!initialized) + { + buffers = malloc(sizeof(OclBuffers)); + + size_t tempSize = + sizeof(GpuHidHaarStageClassifier) * ((GpuHidHaarClassifierCascade *)oldCascade->hid_cascade)->count; + m_nodenum = (datasize - sizeof(GpuHidHaarClassifierCascade) - tempSize - sizeof(GpuHidHaarClassifier) * totalclassifier) + / sizeof(GpuHidHaarTreeNode); + + ((OclBuffers *)buffers)->stagebuffer = openCLCreateBuffer(cv::ocl::Context::getContext(), CL_MEM_READ_ONLY, tempSize); + ((OclBuffers *)buffers)->nodebuffer = openCLCreateBuffer(cv::ocl::Context::getContext(), CL_MEM_READ_ONLY, m_nodenum * sizeof(GpuHidHaarTreeNode)); + } + + if (initialized + && ((m_flags & CV_HAAR_SCALE_IMAGE) ^ (flags & CV_HAAR_SCALE_IMAGE))) + { + openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->candidatebuffer)); + } + + if (flags & CV_HAAR_SCALE_IMAGE) + { + ((OclBuffers *)buffers)->candidatebuffer = openCLCreateBuffer(cv::ocl::Context::getContext(), + CL_MEM_WRITE_ONLY, + 4 * sizeof(int) * outputsz); + } + else + { + ((OclBuffers *)buffers)->candidatebuffer = openCLCreateBuffer(cv::ocl::Context::getContext(), + CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, + 4 * sizeof(int) * outputsz); + } +} + +void cv::ocl::OclCascadeClassifierBuf::CreateFactorRelatedBufs( + const int rows, const int cols, const int flags, + const double scaleFactor, const size_t localThreads[], + CvSize minSize, CvSize maxSize) +{ + if (initialized) + { + if ((m_flags & CV_HAAR_SCALE_IMAGE) && !(flags & CV_HAAR_SCALE_IMAGE)) + { + gimg1.release(); + gsum.release(); + gsqsum.release(); + } + else if (!(m_flags & CV_HAAR_SCALE_IMAGE) && (flags & CV_HAAR_SCALE_IMAGE)) + { + openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->newnodebuffer)); + openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->correctionbuffer)); + openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->pbuffer)); + } + else if ((m_flags & CV_HAAR_SCALE_IMAGE) && (flags & CV_HAAR_SCALE_IMAGE)) + { + if (fabs(m_scaleFactor - scaleFactor) < 1e-6 + && (rows == m_rows && cols == m_cols) + && (minSize.width == m_minSize.width) + && (minSize.height == m_minSize.height) + && (maxSize.width == m_maxSize.width) + && (maxSize.height == m_maxSize.height)) + { + return; + } + } + else + { + if (fabs(m_scaleFactor - scaleFactor) < 1e-6 + && (rows == m_rows && cols == m_cols) + && (minSize.width == m_minSize.width) + && (minSize.height == m_minSize.height) + && (maxSize.width == m_maxSize.width) + && (maxSize.height == m_maxSize.height)) + { + return; + } + else + { + openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->newnodebuffer)); + openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->correctionbuffer)); + openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->pbuffer)); + } + } + } + + int loopcount; + int indexy = 0; + int totalheight = 0; + double factor; + Rect roi; + CvSize sz; + CvSize winSize0 = oldCascade->orig_window_size; + detect_piramid_info *scaleinfo; + if (flags & CV_HAAR_SCALE_IMAGE) + { + for(factor = 1.f;; factor *= scaleFactor) + { + CvSize winSize = { cvRound(winSize0.width * factor), cvRound(winSize0.height * factor) }; + sz.width = cvRound( cols / factor ) + 1; + sz.height = cvRound( rows / factor ) + 1; + CvSize sz1 = { sz.width - winSize0.width - 1, sz.height - winSize0.height - 1 }; + + if( sz1.width <= 0 || sz1.height <= 0 ) + break; + if( winSize.width > maxSize.width || winSize.height > maxSize.height ) + break; + if( winSize.width < minSize.width || winSize.height < minSize.height ) + continue; + + totalheight += sz.height; + sizev.push_back(sz); + scalev.push_back(static_cast(factor)); + } + + loopcount = sizev.size(); + gimg1.create(rows, cols, CV_8UC1); + gsum.create(totalheight + 4, cols + 1, CV_32SC1); + gsqsum.create(totalheight + 4, cols + 1, CV_32FC1); + + scaleinfo = (detect_piramid_info *)malloc(sizeof(detect_piramid_info) * loopcount); + for( int i = 0; i < loopcount; i++ ) + { + sz = sizev[i]; + roi = Rect(0, indexy, sz.width, sz.height); + int width = sz.width - 1 - oldCascade->orig_window_size.width; + int height = sz.height - 1 - oldCascade->orig_window_size.height; + int grpnumperline = (width + localThreads[0] - 1) / localThreads[0]; + int totalgrp = ((height + localThreads[1] - 1) / localThreads[1]) * grpnumperline; + + ((detect_piramid_info *)scaleinfo)[i].width_height = (width << 16) | height; + ((detect_piramid_info *)scaleinfo)[i].grpnumperline_totalgrp = (grpnumperline << 16) | totalgrp; + ((detect_piramid_info *)scaleinfo)[i].imgoff = gsum(roi).offset >> 2; + ((detect_piramid_info *)scaleinfo)[i].factor = scalev[i]; + + indexy += sz.height; + } + } + else + { + for(factor = 1; + cvRound(factor * winSize0.width) < cols - 10 && cvRound(factor * winSize0.height) < rows - 10; + factor *= scaleFactor) + { + CvSize winSize = { cvRound( winSize0.width * factor ), cvRound( winSize0.height * factor ) }; + if( winSize.width < minSize.width || winSize.height < minSize.height ) + { + continue; + } + sizev.push_back(winSize); + scalev.push_back(factor); + } + + loopcount = scalev.size(); + if(loopcount == 0) + { + loopcount = 1; + sizev.push_back(minSize); + scalev.push_back( min(cvRound(minSize.width / winSize0.width), cvRound(minSize.height / winSize0.height)) ); + } + + ((OclBuffers *)buffers)->pbuffer = openCLCreateBuffer(cv::ocl::Context::getContext(), CL_MEM_READ_ONLY, + sizeof(cl_int4) * loopcount); + ((OclBuffers *)buffers)->correctionbuffer = openCLCreateBuffer(cv::ocl::Context::getContext(), CL_MEM_READ_ONLY, + sizeof(cl_float) * loopcount); + ((OclBuffers *)buffers)->newnodebuffer = openCLCreateBuffer(cv::ocl::Context::getContext(), CL_MEM_READ_WRITE, + loopcount * m_nodenum * sizeof(GpuHidHaarTreeNode)); + + scaleinfo = (detect_piramid_info *)malloc(sizeof(detect_piramid_info) * loopcount); + for( int i = 0; i < loopcount; i++ ) + { + sz = sizev[i]; + factor = scalev[i]; + int ystep = cvRound(std::max(2., factor)); + int width = (cols - 1 - sz.width + ystep - 1) / ystep; + int height = (rows - 1 - sz.height + ystep - 1) / ystep; + int grpnumperline = (width + localThreads[0] - 1) / localThreads[0]; + int totalgrp = ((height + localThreads[1] - 1) / localThreads[1]) * grpnumperline; + + ((detect_piramid_info *)scaleinfo)[i].width_height = (width << 16) | height; + ((detect_piramid_info *)scaleinfo)[i].grpnumperline_totalgrp = (grpnumperline << 16) | totalgrp; + ((detect_piramid_info *)scaleinfo)[i].imgoff = 0; + ((detect_piramid_info *)scaleinfo)[i].factor = factor; + } + } + + if (loopcount != m_loopcount) + { + if (initialized) + { + openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->scaleinfobuffer)); + } + ((OclBuffers *)buffers)->scaleinfobuffer = openCLCreateBuffer(cv::ocl::Context::getContext(), CL_MEM_READ_ONLY, sizeof(detect_piramid_info) * loopcount); + } + + openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)cv::ocl::Context::getContext()->oclCommandQueue(), ((OclBuffers *)buffers)->scaleinfobuffer, 1, 0, + sizeof(detect_piramid_info)*loopcount, + scaleinfo, 0, NULL, NULL)); + free(scaleinfo); + + m_loopcount = loopcount; +} + +void cv::ocl::OclCascadeClassifierBuf::GenResult(CV_OUT std::vector& faces, + const std::vector &rectList, + const std::vector &rweights) +{ + CvSeq *result_seq = cvCreateSeq( 0, sizeof(CvSeq), sizeof(CvAvgComp), cvCreateMemStorage(0) ); + + if( findBiggestObject && rectList.size() ) + { + CvAvgComp result_comp = {{0, 0, 0, 0}, 0}; + + for( size_t i = 0; i < rectList.size(); i++ ) + { + cv::Rect r = rectList[i]; + if( r.area() > cv::Rect(result_comp.rect).area() ) + { + result_comp.rect = r; + result_comp.neighbors = rweights[i]; + } + } + cvSeqPush( result_seq, &result_comp ); + } + else + { + for( size_t i = 0; i < rectList.size(); i++ ) + { + CvAvgComp c; + c.rect = rectList[i]; + c.neighbors = rweights[i]; + cvSeqPush( result_seq, &c ); + } + } + + vector vecAvgComp; + Seq(result_seq).copyTo(vecAvgComp); + faces.resize(vecAvgComp.size()); + std::transform(vecAvgComp.begin(), vecAvgComp.end(), faces.begin(), getRect()); +} + +void cv::ocl::OclCascadeClassifierBuf::release() +{ + openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->stagebuffer)); + openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->scaleinfobuffer)); + openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->nodebuffer)); + openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->candidatebuffer)); + + if( (m_flags & CV_HAAR_SCALE_IMAGE) ) + { + cvFree(&oldCascade->hid_cascade); + } + else + { + openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->newnodebuffer)); + openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->correctionbuffer)); + openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->pbuffer)); + } + + free(buffers); + buffers = NULL; +} #ifndef _MAX_PATH #define _MAX_PATH 1024 diff --git a/modules/ocl/src/opencl/haarobjectdetect.cl b/modules/ocl/src/opencl/haarobjectdetect.cl index 9e468b07f5..c7ed96a342 100644 --- a/modules/ocl/src/opencl/haarobjectdetect.cl +++ b/modules/ocl/src/opencl/haarobjectdetect.cl @@ -112,7 +112,7 @@ typedef struct __attribute__((aligned (64))) GpuHidHaarClassifierCascade } GpuHidHaarClassifierCascade; -__kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCascade(//constant GpuHidHaarClassifierCascade * cascade, +__kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCascade( global GpuHidHaarStageClassifier * stagecascadeptr, global int4 * info, global GpuHidHaarTreeNode * nodeptr, @@ -128,12 +128,7 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa const int splitnode, const int4 p, const int4 pq, - const float correction - //const int width, - //const int height, - //const int grpnumperline, - //const int totalgrp -) + const float correction) { int grpszx = get_local_size(0); int grpszy = get_local_size(1); @@ -145,13 +140,8 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa int lcl_sz = mul24(grpszx,grpszy); int lcl_id = mad24(lclidy,grpszx,lclidx); - //assume lcl_sz == 256 or 128 or 64 - //int lcl_sz_shift = (lcl_sz == 256) ? 8 : 7; - //lcl_sz_shift = (lcl_sz == 64) ? 6 : lcl_sz_shift; __local int lclshare[1024]; - -#define OFF 0 - __local int* lcldata = lclshare + OFF;//for save win data + __local int* lcldata = lclshare;//for save win data __local int* glboutindex = lcldata + 28*28;//for save global out index __local int* lclcount = glboutindex + 1;//for save the numuber of temp pass pixel __local int* lcloutindex = lclcount + 1;//for save info of temp pass pixel @@ -181,7 +171,6 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa int totalgrp = scaleinfo1.y & 0xffff; int imgoff = scaleinfo1.z; float factor = as_float(scaleinfo1.w); - //int ystep =1;// factor > 2.0 ? 1 : 2; __global const int * sum = sum1 + imgoff; __global const float * sqsum = sqsum1 + imgoff; @@ -191,8 +180,6 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa int grpidx = grploop - mul24(grpidy, grpnumperline); int x = mad24(grpidx,grpszx,lclidx); int y = mad24(grpidy,grpszy,lclidy); - //candidate_result.x = convert_int_rtn(x*factor); - //candidate_result.y = convert_int_rtn(y*factor); int grpoffx = x-lclidx; int grpoffy = y-lclidy; @@ -211,14 +198,7 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa int4 data = *(__global int4*)&sum[glb_off]; int lcl_off = mad24(lcl_y, readwidth, lcl_x<<2); -#if OFF - lcldata[lcl_off] = data.x; - lcldata[lcl_off+1] = data.y; - lcldata[lcl_off+2] = data.z; - lcldata[lcl_off+3] = data.w; -#else vstore4(data, 0, &lcldata[lcl_off]); -#endif } lcloutindex[lcl_id] = 0; @@ -231,184 +211,170 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa int lcl_off = mad24(lclidy,readwidth,lclidx); int4 cascadeinfo1, cascadeinfo2; cascadeinfo1 = p; - cascadeinfo2 = pq;// + mad24(y, pixelstep, x); + cascadeinfo2 = pq; + cascadeinfo1.x +=lcl_off; + cascadeinfo1.z +=lcl_off; + mean = (lcldata[mad24(cascadeinfo1.y,readwidth,cascadeinfo1.x)] - lcldata[mad24(cascadeinfo1.y,readwidth,cascadeinfo1.z)] - + lcldata[mad24(cascadeinfo1.w,readwidth,cascadeinfo1.x)] + lcldata[mad24(cascadeinfo1.w,readwidth,cascadeinfo1.z)]) + *correction; - //if((x < width) && (y < height)) + int p_offset = mad24(y, pixelstep, x); + + cascadeinfo2.x +=p_offset; + cascadeinfo2.z +=p_offset; + variance_norm_factor =sqsum[mad24(cascadeinfo2.y, pixelstep, cascadeinfo2.x)] - sqsum[mad24(cascadeinfo2.y, pixelstep, cascadeinfo2.z)] - + sqsum[mad24(cascadeinfo2.w, pixelstep, cascadeinfo2.x)] + sqsum[mad24(cascadeinfo2.w, pixelstep, cascadeinfo2.z)]; + + variance_norm_factor = variance_norm_factor * correction - mean * mean; + variance_norm_factor = variance_norm_factor >=0.f ? sqrt(variance_norm_factor) : 1.f; + + for(int stageloop = start_stage; (stageloop < split_stage) && result; stageloop++ ) { - cascadeinfo1.x +=lcl_off; - cascadeinfo1.z +=lcl_off; - mean = (lcldata[mad24(cascadeinfo1.y,readwidth,cascadeinfo1.x)] - lcldata[mad24(cascadeinfo1.y,readwidth,cascadeinfo1.z)] - - lcldata[mad24(cascadeinfo1.w,readwidth,cascadeinfo1.x)] + lcldata[mad24(cascadeinfo1.w,readwidth,cascadeinfo1.z)]) - *correction; + float stage_sum = 0.f; + int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop); + float stagethreshold = as_float(stageinfo.y); + for(int nodeloop = 0; nodeloop < stageinfo.x; nodeloop++ ) + { + __global GpuHidHaarTreeNode* currentnodeptr = (nodeptr + nodecounter); - int p_offset = mad24(y, pixelstep, x); + int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0])); + int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0])); + int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0])); + float4 w = *(__global float4*)(&(currentnodeptr->weight[0])); + float2 alpha2 = *(__global float2*)(&(currentnodeptr->alpha[0])); + float nodethreshold = w.w * variance_norm_factor; - cascadeinfo2.x +=p_offset; - cascadeinfo2.z +=p_offset; - variance_norm_factor =sqsum[mad24(cascadeinfo2.y, pixelstep, cascadeinfo2.x)] - sqsum[mad24(cascadeinfo2.y, pixelstep, cascadeinfo2.z)] - - sqsum[mad24(cascadeinfo2.w, pixelstep, cascadeinfo2.x)] + sqsum[mad24(cascadeinfo2.w, pixelstep, cascadeinfo2.z)]; + info1.x +=lcl_off; + info1.z +=lcl_off; + info2.x +=lcl_off; + info2.z +=lcl_off; - variance_norm_factor = variance_norm_factor * correction - mean * mean; - variance_norm_factor = variance_norm_factor >=0.f ? sqrt(variance_norm_factor) : 1.f; - //if( cascade->is_stump_based ) - //{ - for(int stageloop = start_stage; (stageloop < split_stage) && result; stageloop++ ) + float classsum = (lcldata[mad24(info1.y,readwidth,info1.x)] - lcldata[mad24(info1.y,readwidth,info1.z)] - + lcldata[mad24(info1.w,readwidth,info1.x)] + lcldata[mad24(info1.w,readwidth,info1.z)]) * w.x; + + classsum += (lcldata[mad24(info2.y,readwidth,info2.x)] - lcldata[mad24(info2.y,readwidth,info2.z)] - + lcldata[mad24(info2.w,readwidth,info2.x)] + lcldata[mad24(info2.w,readwidth,info2.z)]) * w.y; + + info3.x +=lcl_off; + info3.z +=lcl_off; + classsum += (lcldata[mad24(info3.y,readwidth,info3.x)] - lcldata[mad24(info3.y,readwidth,info3.z)] - + lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z; + + stage_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x; + nodecounter++; + } + + result = (stage_sum >= stagethreshold); + } + + if(result && (x < width) && (y < height)) + { + int queueindex = atomic_inc(lclcount); + lcloutindex[queueindex<<1] = (lclidy << 16) | lclidx; + lcloutindex[(queueindex<<1)+1] = as_int(variance_norm_factor); + } + barrier(CLK_LOCAL_MEM_FENCE); + int queuecount = lclcount[0]; + barrier(CLK_LOCAL_MEM_FENCE); + nodecounter = splitnode; + for(int stageloop = split_stage; stageloop< end_stage && queuecount>0; stageloop++) + { + lclcount[0]=0; + barrier(CLK_LOCAL_MEM_FENCE); + + int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop); + float stagethreshold = as_float(stageinfo.y); + + int perfscale = queuecount > 4 ? 3 : 2; + int queuecount_loop = (queuecount + (1<> perfscale; + int lcl_compute_win = lcl_sz >> perfscale; + int lcl_compute_win_id = (lcl_id >>(6-perfscale)); + int lcl_loops = (stageinfo.x + lcl_compute_win -1) >> (6-perfscale); + int lcl_compute_id = lcl_id - (lcl_compute_win_id << (6-perfscale)); + for(int queueloop=0; queueloop>16),readwidth,temp_coord & 0xffff); + + if(lcl_compute_win_id < queuecount) { - __global GpuHidHaarTreeNode* currentnodeptr = (nodeptr + nodecounter); - int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0])); - int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0])); - int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0])); - float4 w = *(__global float4*)(&(currentnodeptr->weight[0])); - float2 alpha2 = *(__global float2*)(&(currentnodeptr->alpha[0])); - float nodethreshold = w.w * variance_norm_factor; + int tempnodecounter = lcl_compute_id; + float part_sum = 0.f; + for(int lcl_loop=0; lcl_loopp[0][0])); + int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0])); + int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0])); + float4 w = *(__global float4*)(&(currentnodeptr->weight[0])); + float2 alpha2 = *(__global float2*)(&(currentnodeptr->alpha[0])); + float nodethreshold = w.w * variance_norm_factor; - float classsum = (lcldata[mad24(info1.y,readwidth,info1.x)] - lcldata[mad24(info1.y,readwidth,info1.z)] - - lcldata[mad24(info1.w,readwidth,info1.x)] + lcldata[mad24(info1.w,readwidth,info1.z)]) * w.x; + info1.x +=queue_pixel; + info1.z +=queue_pixel; + info2.x +=queue_pixel; + info2.z +=queue_pixel; + + float classsum = (lcldata[mad24(info1.y,readwidth,info1.x)] - lcldata[mad24(info1.y,readwidth,info1.z)] - + lcldata[mad24(info1.w,readwidth,info1.x)] + lcldata[mad24(info1.w,readwidth,info1.z)]) * w.x; - classsum += (lcldata[mad24(info2.y,readwidth,info2.x)] - lcldata[mad24(info2.y,readwidth,info2.z)] - - lcldata[mad24(info2.w,readwidth,info2.x)] + lcldata[mad24(info2.w,readwidth,info2.z)]) * w.y; + classsum += (lcldata[mad24(info2.y,readwidth,info2.x)] - lcldata[mad24(info2.y,readwidth,info2.z)] - + lcldata[mad24(info2.w,readwidth,info2.x)] + lcldata[mad24(info2.w,readwidth,info2.z)]) * w.y; + info3.x +=queue_pixel; + info3.z +=queue_pixel; + classsum += (lcldata[mad24(info3.y,readwidth,info3.x)] - lcldata[mad24(info3.y,readwidth,info3.z)] - + lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z; - //if((info3.z - info3.x) && (!stageinfo.z)) - //{ - info3.x +=lcl_off; - info3.z +=lcl_off; - classsum += (lcldata[mad24(info3.y,readwidth,info3.x)] - lcldata[mad24(info3.y,readwidth,info3.z)] - - lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z; - //} - stage_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x; - nodecounter++; + part_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x; + tempnodecounter +=lcl_compute_win; + }//end for(int lcl_loop=0;lcl_loop= stagethreshold); - } - - if(result && (x < width) && (y < height)) - { - int queueindex = atomic_inc(lclcount); - lcloutindex[queueindex<<1] = (lclidy << 16) | lclidx; - lcloutindex[(queueindex<<1)+1] = as_int(variance_norm_factor); - } - barrier(CLK_LOCAL_MEM_FENCE); - int queuecount = lclcount[0]; - barrier(CLK_LOCAL_MEM_FENCE); - nodecounter = splitnode; - for(int stageloop = split_stage; stageloop< end_stage && queuecount>0; stageloop++) - { - //barrier(CLK_LOCAL_MEM_FENCE); - //if(lcl_id == 0) - lclcount[0]=0; barrier(CLK_LOCAL_MEM_FENCE); - - int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop); - float stagethreshold = as_float(stageinfo.y); - - int perfscale = queuecount > 4 ? 3 : 2; - int queuecount_loop = (queuecount + (1<> perfscale; - int lcl_compute_win = lcl_sz >> perfscale; - int lcl_compute_win_id = (lcl_id >>(6-perfscale)); - int lcl_loops = (stageinfo.x + lcl_compute_win -1) >> (6-perfscale); - int lcl_compute_id = lcl_id - (lcl_compute_win_id << (6-perfscale)); - for(int queueloop=0; queueloop>16),readwidth,temp_coord & 0xffff); - - //barrier(CLK_LOCAL_MEM_FENCE); - if(lcl_compute_win_id < queuecount) + for(int i=0; ip[0][0])); - int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0])); - int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0])); - float4 w = *(__global float4*)(&(currentnodeptr->weight[0])); - float2 alpha2 = *(__global float2*)(&(currentnodeptr->alpha[0])); - float nodethreshold = w.w * variance_norm_factor; - - info1.x +=queue_pixel; - info1.z +=queue_pixel; - info2.x +=queue_pixel; - info2.z +=queue_pixel; - - float classsum = (lcldata[mad24(info1.y,readwidth,info1.x)] - lcldata[mad24(info1.y,readwidth,info1.z)] - - lcldata[mad24(info1.w,readwidth,info1.x)] + lcldata[mad24(info1.w,readwidth,info1.z)]) * w.x; - - - classsum += (lcldata[mad24(info2.y,readwidth,info2.x)] - lcldata[mad24(info2.y,readwidth,info2.z)] - - lcldata[mad24(info2.w,readwidth,info2.x)] + lcldata[mad24(info2.w,readwidth,info2.z)]) * w.y; - //if((info3.z - info3.x) && (!stageinfo.z)) - //{ - info3.x +=queue_pixel; - info3.z +=queue_pixel; - classsum += (lcldata[mad24(info3.y,readwidth,info3.x)] - lcldata[mad24(info3.y,readwidth,info3.z)] - - lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z; - //} - part_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x; - tempnodecounter +=lcl_compute_win; - }//end for(int lcl_loop=0;lcl_loop= stagethreshold && (lcl_compute_id==0)) { - for(int i=0; i= stagethreshold && (lcl_compute_id==0)) - { - int queueindex = atomic_inc(lclcount); - lcloutindex[queueindex<<1] = temp_coord; - lcloutindex[(queueindex<<1)+1] = as_int(variance_norm_factor); - } - lcl_compute_win_id +=(1<0;stageloop++) - //barrier(CLK_LOCAL_MEM_FENCE); - if(lcl_id> 16)); - temp = glboutindex[0]; - int4 candidate_result; - candidate_result.zw = (int2)convert_int_rtn(factor*20.f); - candidate_result.x = convert_int_rtn(x*factor); - candidate_result.y = convert_int_rtn(y*factor); - atomic_inc(glboutindex); - candidate[outputoff+temp+lcl_id] = candidate_result; - } + }//end for(int queueloop=0;queueloop0;stageloop++) + + if(lcl_id> 16)); + temp = glboutindex[0]; + int4 candidate_result; + candidate_result.zw = (int2)convert_int_rtn(factor*20.f); + candidate_result.x = convert_int_rtn(x*factor); + candidate_result.y = convert_int_rtn(y*factor); + atomic_inc(glboutindex); + candidate[outputoff+temp+lcl_id] = candidate_result; + } + barrier(CLK_LOCAL_MEM_FENCE); }//end for(int grploop=grpidx;grploop> 16; + int height = scaleinfo1.x & 0xffff; + int grpnumperline = (scaleinfo1.y & 0xffff0000) >> 16; + int totalgrp = scaleinfo1.y & 0xffff; + float factor = as_float(scaleinfo1.w); + float correction_t = correction[scalei]; + int ystep = (int)(max(2.0f, factor) + 0.5f); - for (int scalei = 0; scalei < loopcount; scalei++) + for (int grploop = get_group_id(0); grploop < totalgrp; grploop += grpnumx) { - int4 scaleinfo1; - scaleinfo1 = info[scalei]; - int width = (scaleinfo1.x & 0xffff0000) >> 16; - int height = scaleinfo1.x & 0xffff; - int grpnumperline = (scaleinfo1.y & 0xffff0000) >> 16; - int totalgrp = scaleinfo1.y & 0xffff; - float factor = as_float(scaleinfo1.w); - float correction_t = correction[scalei]; - int ystep = (int)(max(2.0f, factor) + 0.5f); + int4 cascadeinfo = p[scalei]; + int grpidy = grploop / grpnumperline; + int grpidx = grploop - mul24(grpidy, grpnumperline); + int ix = mad24(grpidx, grpszx, lclidx); + int iy = mad24(grpidy, grpszy, lclidy); + int x = ix * ystep; + int y = iy * ystep; + lcloutindex[lcl_id] = 0; + lclcount[0] = 0; + int nodecounter; + float mean, variance_norm_factor; + //if((ix < width) && (iy < height)) + { + const int p_offset = mad24(y, step, x); + cascadeinfo.x += p_offset; + cascadeinfo.z += p_offset; + mean = (sum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.x), 0, max_idx)] - sum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.z), 0, max_idx)] - + sum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.x), 0, max_idx)] + sum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.z), 0, max_idx)]) + * correction_t; + variance_norm_factor = sqsum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.x), 0, max_idx)] - sqsum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.z), 0, max_idx)] - + sqsum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.x), 0, max_idx)] + sqsum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.z), 0, max_idx)]; + variance_norm_factor = variance_norm_factor * correction_t - mean * mean; + variance_norm_factor = variance_norm_factor >= 0.f ? sqrt(variance_norm_factor) : 1.f; + bool result = true; + nodecounter = startnode + nodecount * scalei; - for (int grploop = get_group_id(0); grploop < totalgrp; grploop += grpnumx) + for (int stageloop = start_stage; (stageloop < end_stage) && result; stageloop++) { - int4 cascadeinfo = p[scalei]; - int grpidy = grploop / grpnumperline; - int grpidx = grploop - mul24(grpidy, grpnumperline); - int ix = mad24(grpidx, grpszx, lclidx); - int iy = mad24(grpidy, grpszy, lclidy); - int x = ix * ystep; - int y = iy * ystep; - lcloutindex[lcl_id] = 0; - lclcount[0] = 0; - int result = 1, nodecounter; - float mean, variance_norm_factor; - //if((ix < width) && (iy < height)) - { - const int p_offset = mad24(y, step, x); - cascadeinfo.x += p_offset; - cascadeinfo.z += p_offset; - mean = (sum[mad24(cascadeinfo.y, step, cascadeinfo.x)] - sum[mad24(cascadeinfo.y, step, cascadeinfo.z)] - - sum[mad24(cascadeinfo.w, step, cascadeinfo.x)] + sum[mad24(cascadeinfo.w, step, cascadeinfo.z)]) - * correction_t; - variance_norm_factor = sqsum[mad24(cascadeinfo.y, step, cascadeinfo.x)] - sqsum[mad24(cascadeinfo.y, step, cascadeinfo.z)] - - sqsum[mad24(cascadeinfo.w, step, cascadeinfo.x)] + sqsum[mad24(cascadeinfo.w, step, cascadeinfo.z)]; - variance_norm_factor = variance_norm_factor * correction_t - mean * mean; - variance_norm_factor = variance_norm_factor >= 0.f ? sqrt(variance_norm_factor) : 1.f; - result = 1; - nodecounter = startnode + nodecount * scalei; - - for (int stageloop = start_stage; stageloop < end_stage && result; stageloop++) - { - float stage_sum = 0.f; - int4 stageinfo = *(global int4 *)(stagecascadeptr + stageloop); - float stagethreshold = as_float(stageinfo.y); - - for (int nodeloop = 0; nodeloop < stageinfo.x; nodeloop++) - { - __global GpuHidHaarTreeNode *currentnodeptr = (nodeptr + nodecounter); - int4 info1 = *(__global int4 *)(&(currentnodeptr->p[0][0])); - int4 info2 = *(__global int4 *)(&(currentnodeptr->p[1][0])); - int4 info3 = *(__global int4 *)(&(currentnodeptr->p[2][0])); - float4 w = *(__global float4 *)(&(currentnodeptr->weight[0])); - float2 alpha2 = *(__global float2 *)(&(currentnodeptr->alpha[0])); - float nodethreshold = w.w * variance_norm_factor; - info1.x += p_offset; - info1.z += p_offset; - info2.x += p_offset; - info2.z += p_offset; - float classsum = (sum[mad24(info1.y, step, info1.x)] - sum[mad24(info1.y, step, info1.z)] - - sum[mad24(info1.w, step, info1.x)] + sum[mad24(info1.w, step, info1.z)]) * w.x; - classsum += (sum[mad24(info2.y, step, info2.x)] - sum[mad24(info2.y, step, info2.z)] - - sum[mad24(info2.w, step, info2.x)] + sum[mad24(info2.w, step, info2.z)]) * w.y; - info3.x += p_offset; - info3.z += p_offset; - classsum += (sum[mad24(info3.y, step, info3.x)] - sum[mad24(info3.y, step, info3.z)] - - sum[mad24(info3.w, step, info3.x)] + sum[mad24(info3.w, step, info3.z)]) * w.z; - stage_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x; - nodecounter++; - } - - result = (stage_sum >= stagethreshold); - } - - if (result && (ix < width) && (iy < height)) - { - int queueindex = atomic_inc(lclcount); - lcloutindex[queueindex << 1] = (y << 16) | x; - lcloutindex[(queueindex << 1) + 1] = as_int(variance_norm_factor); - } - - barrier(CLK_LOCAL_MEM_FENCE); - int queuecount = lclcount[0]; - nodecounter = splitnode + nodecount * scalei; - - if (lcl_id < queuecount) - { - int temp = lcloutindex[lcl_id << 1]; - int x = temp & 0xffff; - int y = (temp & (int)0xffff0000) >> 16; - temp = glboutindex[0]; - int4 candidate_result; - candidate_result.zw = (int2)convert_int_rtn(factor * 20.f); - candidate_result.x = x; - candidate_result.y = y; - atomic_inc(glboutindex); - candidate[outputoff + temp + lcl_id] = candidate_result; - } - - barrier(CLK_LOCAL_MEM_FENCE); - } + float stage_sum = 0.f; + int stagecount = stagecascadeptr[stageloop].count; + for (int nodeloop = 0; nodeloop < stagecount; nodeloop++) + { + __global GpuHidHaarTreeNode *currentnodeptr = (nodeptr + nodecounter); + int4 info1 = *(__global int4 *)(&(currentnodeptr->p[0][0])); + int4 info2 = *(__global int4 *)(&(currentnodeptr->p[1][0])); + int4 info3 = *(__global int4 *)(&(currentnodeptr->p[2][0])); + float4 w = *(__global float4 *)(&(currentnodeptr->weight[0])); + float2 alpha2 = *(__global float2 *)(&(currentnodeptr->alpha[0])); + float nodethreshold = w.w * variance_norm_factor; + info1.x += p_offset; + info1.z += p_offset; + info2.x += p_offset; + info2.z += p_offset; + float classsum = (sum[clamp(mad24(info1.y, step, info1.x), 0, max_idx)] - sum[clamp(mad24(info1.y, step, info1.z), 0, max_idx)] - + sum[clamp(mad24(info1.w, step, info1.x), 0, max_idx)] + sum[clamp(mad24(info1.w, step, info1.z), 0, max_idx)]) * w.x; + classsum += (sum[clamp(mad24(info2.y, step, info2.x), 0, max_idx)] - sum[clamp(mad24(info2.y, step, info2.z), 0, max_idx)] - + sum[clamp(mad24(info2.w, step, info2.x), 0, max_idx)] + sum[clamp(mad24(info2.w, step, info2.z), 0, max_idx)]) * w.y; + info3.x += p_offset; + info3.z += p_offset; + classsum += (sum[clamp(mad24(info3.y, step, info3.x), 0, max_idx)] - sum[clamp(mad24(info3.y, step, info3.z), 0, max_idx)] - + sum[clamp(mad24(info3.w, step, info3.x), 0, max_idx)] + sum[clamp(mad24(info3.w, step, info3.z), 0, max_idx)]) * w.z; + stage_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x; + nodecounter++; + } + result = (bool)(stage_sum >= stagecascadeptr[stageloop].threshold); } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (result && (ix < width) && (iy < height)) + { + int queueindex = atomic_inc(lclcount); + lcloutindex[queueindex] = (y << 16) | x; + } + + barrier(CLK_LOCAL_MEM_FENCE); + int queuecount = lclcount[0]; + + if (lcl_id < queuecount) + { + int temp = lcloutindex[lcl_id]; + int x = temp & 0xffff; + int y = (temp & (int)0xffff0000) >> 16; + temp = atomic_inc(glboutindex); + int4 candidate_result; + candidate_result.zw = (int2)convert_int_rtn(factor * 20.f); + candidate_result.x = x; + candidate_result.y = y; + candidate[outputoff + temp + lcl_id] = candidate_result; + } + + barrier(CLK_LOCAL_MEM_FENCE); + } } + } } __kernel void gpuscaleclassifier(global GpuHidHaarTreeNode *orinode, global GpuHidHaarTreeNode *newnode, float scale, float weight_scale, int nodenum) { - int counter = get_global_id(0); - int tr_x[3], tr_y[3], tr_h[3], tr_w[3], i = 0; - GpuHidHaarTreeNode t1 = *(orinode + counter); + int counter = get_global_id(0); + int tr_x[3], tr_y[3], tr_h[3], tr_w[3], i = 0; + GpuHidHaarTreeNode t1 = *(orinode + counter); #pragma unroll - for (i = 0; i < 3; i++) - { - tr_x[i] = (int)(t1.p[i][0] * scale + 0.5f); - tr_y[i] = (int)(t1.p[i][1] * scale + 0.5f); - tr_w[i] = (int)(t1.p[i][2] * scale + 0.5f); - tr_h[i] = (int)(t1.p[i][3] * scale + 0.5f); - } + for (i = 0; i < 3; i++) + { + tr_x[i] = (int)(t1.p[i][0] * scale + 0.5f); + tr_y[i] = (int)(t1.p[i][1] * scale + 0.5f); + tr_w[i] = (int)(t1.p[i][2] * scale + 0.5f); + tr_h[i] = (int)(t1.p[i][3] * scale + 0.5f); + } - t1.weight[0] = t1.p[2][0] ? -(t1.weight[1] * tr_h[1] * tr_w[1] + t1.weight[2] * tr_h[2] * tr_w[2]) / (tr_h[0] * tr_w[0]) : -t1.weight[1] * tr_h[1] * tr_w[1] / (tr_h[0] * tr_w[0]); - counter += nodenum; + t1.weight[0] = t1.p[2][0] ? -(t1.weight[1] * tr_h[1] * tr_w[1] + t1.weight[2] * tr_h[2] * tr_w[2]) / (tr_h[0] * tr_w[0]) : -t1.weight[1] * tr_h[1] * tr_w[1] / (tr_h[0] * tr_w[0]); + counter += nodenum; #pragma unroll - for (i = 0; i < 3; i++) - { - newnode[counter].p[i][0] = tr_x[i]; - newnode[counter].p[i][1] = tr_y[i]; - newnode[counter].p[i][2] = tr_x[i] + tr_w[i]; - newnode[counter].p[i][3] = tr_y[i] + tr_h[i]; - newnode[counter].weight[i] = t1.weight[i] * weight_scale; - } + for (i = 0; i < 3; i++) + { + newnode[counter].p[i][0] = tr_x[i]; + newnode[counter].p[i][1] = tr_y[i]; + newnode[counter].p[i][2] = tr_x[i] + tr_w[i]; + newnode[counter].p[i][3] = tr_y[i] + tr_h[i]; + newnode[counter].weight[i] = t1.weight[i] * weight_scale; + } - newnode[counter].left = t1.left; - newnode[counter].right = t1.right; - newnode[counter].threshold = t1.threshold; - newnode[counter].alpha[0] = t1.alpha[0]; - newnode[counter].alpha[1] = t1.alpha[1]; + newnode[counter].left = t1.left; + newnode[counter].right = t1.right; + newnode[counter].threshold = t1.threshold; + newnode[counter].alpha[0] = t1.alpha[0]; + newnode[counter].alpha[1] = t1.alpha[1]; } + diff --git a/modules/ocl/test/test_haar.cpp b/modules/ocl/test/test_haar.cpp index 9bff324662..19147b04b8 100644 --- a/modules/ocl/test/test_haar.cpp +++ b/modules/ocl/test/test_haar.cpp @@ -16,6 +16,7 @@ // // @Authors // Jia Haipeng, jiahaipeng95@gmail.com +// Sen Liu, swjutls1987@126.com // // Redistribution and use in source and binary forms, with or without modification, // are permitted provided that the following conditions are met: @@ -61,40 +62,31 @@ struct getRect } }; -PARAM_TEST_CASE(HaarTestBase, int, int) +PARAM_TEST_CASE(Haar, double, int) { - //std::vector oclinfo; cv::ocl::OclCascadeClassifier cascade, nestedCascade; + cv::ocl::OclCascadeClassifierBuf cascadebuf; cv::CascadeClassifier cpucascade, cpunestedCascade; - // Mat img; double scale; - int index; + int flags; virtual void SetUp() { - scale = 1.0; - index = 0; + scale = GET_PARAM(0); + flags = GET_PARAM(1); string cascadeName = workdir + "../../data/haarcascades/haarcascade_frontalface_alt.xml"; - if( (!cascade.load( cascadeName )) || (!cpucascade.load(cascadeName))) + if( (!cascade.load( cascadeName )) || (!cpucascade.load(cascadeName)) || (!cascadebuf.load( cascadeName ))) { cout << "ERROR: Could not load classifier cascade" << endl; return; } - //int devnums = getDevice(oclinfo); - //CV_Assert(devnums>0); - ////if you want to use undefault device, set it here - ////setDevice(oclinfo[0]); - //cv::ocl::setBinpath("E:\\"); } }; ////////////////////////////////faceDetect///////////////////////////////////////////////// - -struct Haar : HaarTestBase {}; - -TEST_F(Haar, FaceDetect) +TEST_P(Haar, FaceDetect) { string imgName = workdir + "lena.jpg"; Mat img = imread( imgName, 1 ); @@ -105,59 +97,65 @@ TEST_F(Haar, FaceDetect) return ; } - //int i = 0; - //double t = 0; vector faces, oclfaces; - // const static Scalar colors[] = { CV_RGB(0, 0, 255), - // CV_RGB(0, 128, 255), - // CV_RGB(0, 255, 255), - // CV_RGB(0, 255, 0), - // CV_RGB(255, 128, 0), - // CV_RGB(255, 255, 0), - // CV_RGB(255, 0, 0), - // CV_RGB(255, 0, 255) - // } ; - Mat gray, smallImg(cvRound (img.rows / scale), cvRound(img.cols / scale), CV_8UC1 ); MemStorage storage(cvCreateMemStorage(0)); cvtColor( img, gray, CV_BGR2GRAY ); resize( gray, smallImg, smallImg.size(), 0, 0, INTER_LINEAR ); equalizeHist( smallImg, smallImg ); - cv::ocl::oclMat image; CvSeq *_objects; image.upload(smallImg); _objects = cascade.oclHaarDetectObjects( image, storage, 1.1, - 3, 0 - | CV_HAAR_SCALE_IMAGE - , Size(30, 30), Size(0, 0) ); + 3, flags, Size(30, 30), Size(0, 0) ); vector vecAvgComp; Seq(_objects).copyTo(vecAvgComp); oclfaces.resize(vecAvgComp.size()); std::transform(vecAvgComp.begin(), vecAvgComp.end(), oclfaces.begin(), getRect()); - cpucascade.detectMultiScale( smallImg, faces, 1.1, - 3, 0 - | CV_HAAR_SCALE_IMAGE - , Size(30, 30), Size(0, 0) ); + cpucascade.detectMultiScale( smallImg, faces, 1.1, 3, + flags, + Size(30, 30), Size(0, 0) ); EXPECT_EQ(faces.size(), oclfaces.size()); - /* for( vector::const_iterator r = faces.begin(); r != faces.end(); r++, i++ ) - { - Mat smallImgROI; - Point center; - Scalar color = colors[i%8]; - int radius; - center.x = cvRound((r->x + r->width*0.5)*scale); - center.y = cvRound((r->y + r->height*0.5)*scale); - radius = cvRound((r->width + r->height)*0.25*scale); - circle( img, center, radius, color, 3, 8, 0 ); - } */ - //namedWindow("result"); - //imshow("result",img); - //waitKey(0); - //destroyAllWindows(); - } + +TEST_P(Haar, FaceDetectUseBuf) +{ + string imgName = workdir + "lena.jpg"; + Mat img = imread( imgName, 1 ); + + if(img.empty()) + { + std::cout << "Couldn't read " << imgName << std::endl; + return ; + } + + vector faces, oclfaces; + + Mat gray, smallImg(cvRound (img.rows / scale), cvRound(img.cols / scale), CV_8UC1 ); + MemStorage storage(cvCreateMemStorage(0)); + cvtColor( img, gray, CV_BGR2GRAY ); + resize( gray, smallImg, smallImg.size(), 0, 0, INTER_LINEAR ); + equalizeHist( smallImg, smallImg ); + + cv::ocl::oclMat image; + image.upload(smallImg); + + cascadebuf.detectMultiScale( image, oclfaces, 1.1, 3, + flags, + Size(30, 30), Size(0, 0) ); + cascadebuf.release(); + + cpucascade.detectMultiScale( smallImg, faces, 1.1, 3, + flags, + Size(30, 30), Size(0, 0) ); + EXPECT_EQ(faces.size(), oclfaces.size()); +} + +INSTANTIATE_TEST_CASE_P(FaceDetect, Haar, + Combine(Values(1.0), + Values(CV_HAAR_SCALE_IMAGE, 0))); + #endif // HAVE_OPENCL