LBP: switched to texture implementation

This commit is contained in:
Marina Kolpakova 2012-07-10 11:58:15 +00:00
parent b0606b0557
commit 86d785622b
6 changed files with 143 additions and 131 deletions

View File

@ -1435,7 +1435,7 @@ public:
bool load(const std::string& filename);
void release();
int detectMultiScale(const GpuMat& image, GpuMat& scaledImageBuffer, GpuMat& objectsBuf, double scaleFactor = 1.1, int minNeighbors = 4,
int detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, double scaleFactor = 1.1, int minNeighbors = 4,
cv::Size maxObjectSize = cv::Size()/*, Size minSize = Size()*/);
void preallocateIntegralBuffer(cv::Size desired);
Size getClassifierSize() const;

View File

@ -69,16 +69,14 @@ GPU_PERF_TEST_1(LBPClassifier, cv::gpu::DeviceInfo)
cv::gpu::GpuMat img(img_host);
cv::gpu::GpuMat gpu_rects, buffer;
cv::gpu::GpuMat gpu_rects;
cv::gpu::CascadeClassifier_GPU_LBP cascade(img.size());
ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath("gpu/lbpcascade/lbpcascade_frontalface.xml")));
// cascade.detectMultiScale(img, objects_buffer);
cascade.detectMultiScale(img, buffer, gpu_rects);
cascade.detectMultiScale(img, gpu_rects);
TEST_CYCLE()
{
cascade.detectMultiScale(img, buffer, gpu_rects);
cascade.detectMultiScale(img, gpu_rects);
}
}

View File

@ -70,7 +70,7 @@ Size cv::gpu::CascadeClassifier_GPU_LBP::getClassifierSize() const
void cv::gpu::CascadeClassifier_GPU_LBP::preallocateIntegralBuffer(cv::Size /*desired*/) { throw_nogpu();}
void cv::gpu::CascadeClassifier_GPU_LBP::initializeBuffers(cv::Size /*frame*/) { throw_nogpu();}
int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const cv::gpu::GpuMat& /*image*/, cv::gpu::GpuMat& /*scaledImageBuffer*/, cv::gpu::GpuMat& /*objectsBuf*/,
int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const cv::gpu::GpuMat& /*image*/, cv::gpu::GpuMat& /*objectsBuf*/,
double /*scaleFactor*/, int /*minNeighbors*/, cv::Size /*maxObjectSize*/){ throw_nogpu(); return 0;}
#else
@ -299,28 +299,29 @@ namespace cv { namespace gpu { namespace device
{
namespace lbp
{
void classifyStump(const DevMem2Db mstages,
const int nstages,
const DevMem2Di mnodes,
const DevMem2Df mleaves,
const DevMem2Di msubsets,
const DevMem2Db mfeatures,
const DevMem2Di integral,
const int workWidth,
const int workHeight,
const int clWidth,
const int clHeight,
float scale,
int step,
int subsetSize,
DevMem2D_<int4> objects,
unsigned int* classified);
void classifyStump(const DevMem2Db& mstages,
const int nstages,
const DevMem2Di& mnodes,
const DevMem2Df& mleaves,
const DevMem2Di& msubsets,
const DevMem2Db& mfeatures,
const int workWidth,
const int workHeight,
const int clWidth,
const int clHeight,
float scale,
int step,
int subsetSize,
DevMem2D_<int4> objects,
unsigned int* classified);
int connectedConmonents(DevMem2D_<int4> candidates, DevMem2D_<int4> objects,int groupThreshold, float grouping_eps, unsigned int* nclasses);
int connectedConmonents(DevMem2D_<int4> candidates, DevMem2D_<int4> objects,int groupThreshold, float grouping_eps, unsigned int* nclasses);
void bindIntegral(DevMem2Di integral);
void unbindIntegral();
}
}}}
int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, GpuMat& scaledImageBuffer, GpuMat& objects,
int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, GpuMat& objects,
double scaleFactor, int groupThreshold, cv::Size maxObjectSize /*, Size minSize=Size()*/)
{
CV_Assert( scaleFactor > 1 && image.depth() == CV_8U );
@ -332,10 +333,12 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp
if( !objects.empty() && objects.depth() == CV_32S)
objects.reshape(4, 1);
else
objects.create(1 , defaultObjSearchNum, CV_32SC4);
GpuMat candidates(1 , defaultObjSearchNum, CV_32SC4);
// GpuMat candidates(objects);
objects.create(1 , image.cols >> 4, CV_32SC4);
GpuMat candidates(1 , image.cols >> 1, CV_32SC4);
// GpuMat candidates(1 , defaultObjSearchNum, CV_32SC4);
// used for debug
// candidates.setTo(cv::Scalar::all(0));
// objects.setTo(cv::Scalar::all(0));
if (maxObjectSize == cv::Size())
maxObjectSize = image.size();
@ -347,9 +350,11 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp
cudaMalloc(&dclassified, sizeof(int));
cudaMemcpy(dclassified, classified, sizeof(int), cudaMemcpyHostToDevice);
int step;
cv::gpu::device::lbp::bindIntegral(integral);
for( double factor = 1; ; factor *= scaleFactor )
{
// if (factor > 2.0) break;
cv::Size windowSize(cvRound(NxM.width * factor), cvRound(NxM.height * factor));
cv::Size scaledImageSize(cvRound( image.cols / factor ), cvRound( image.rows / factor ));
cv::Size processingRectSize( scaledImageSize.width - NxM.width + 1, scaledImageSize.height - NxM.height + 1 );
@ -365,7 +370,7 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp
GpuMat scaledImg(resuzeBuffer, cv::Rect(0, 0, scaledImageSize.width, scaledImageSize.height));
GpuMat scaledIntegral(integral, cv::Rect(0, 0, scaledImageSize.width + 1, scaledImageSize.height + 1));
GpuMat currBuff = integralBuffer;//(integralBuffer, cv::Rect(0, 0, integralBuffer.width, integralBuffer.height));
GpuMat currBuff = integralBuffer;
cv::gpu::resize(image, scaledImg, scaledImageSize, 0, 0, CV_INTER_LINEAR);
cv::gpu::integralBuffered(scaledImg, scaledIntegral, currBuff);
@ -373,8 +378,10 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp
step = (factor <= 2.) + 1;
cv::gpu::device::lbp::classifyStump(stage_mat, stage_mat.cols / sizeof(Stage), nodes_mat, leaves_mat, subsets_mat, features_mat,
scaledIntegral, processingRectSize.width, processingRectSize.height, windowSize.width, windowSize.height, factor, step, subsetSize, candidates, dclassified);
processingRectSize.width, processingRectSize.height, windowSize.width, windowSize.height, factor, step, subsetSize, candidates, dclassified);
}
cv::gpu::device::lbp::unbindIntegral();
if (groupThreshold <= 0 || objects.empty())
return 0;
cv::gpu::device::lbp::connectedConmonents(candidates, objects, groupThreshold, grouping_eps, dclassified);

View File

@ -48,8 +48,102 @@ namespace cv { namespace gpu { namespace device
{
namespace lbp
{
// File-scope 2D texture reference with int elements, read back unconverted
// (cudaReadModeElementType). It is bound by bindIntegral(), released by
// unbindIntegral(), and sampled through tex2D() in LBP::operator().
// Constructor arguments: non-normalized coordinates (false), point filtering,
// clamp addressing.
texture<int, cudaTextureType2D, cudaReadModeElementType> tintegral(false, cudaFilterModePoint, cudaAddressModeClamp);
// Stateless device functor that evaluates one LBP feature by sampling the
// integral image currently bound to the `tintegral` texture.
struct LBP
{
// The functor has no data members, so the copy constructor has nothing to copy.
__device__ __forceinline__ LBP(const LBP& other) {(void)other;}
__device__ __forceinline__ LBP() {}
// Reads a 4x4 grid of integral values starting at (tx, ty), with horizontal
// spacing `featurez` and vertical spacing `fh`, and turns it into the sums of
// the 3x3 cells s0..s8 (row-major, s4 is the centre cell). Every outer cell is
// then compared against the centre cell.
//   ty, tx    - texture coordinates of the top-left grid point
//   fh        - vertical step between grid rows
//   featurez  - horizontal step between grid columns
//   shift     - output: 5-bit code built from s3, s5, s6, s7, s8 (see below)
//   returns   - 3-bit code built from s0, s1, s2
// A bit is set when (cell - centre) is non-negative: `x >> 31` is expected to be
// all ones for a negative x (assumes arithmetic right shift of int), so
// `~(x >> 31) & mask` keeps `mask` only for x >= 0.
// NOTE(review): the previous comment described z,w as a bottom-right corner, but
// this code uses them purely as cell steps - confirm against the feature layout.
__device__ __forceinline__ int operator() (int ty, int tx, int fh, int featurez, int& shift) const
{
int anchors[9];
// Grid row 0: anchors[0..2] hold differences of horizontally adjacent samples.
anchors[0] = tex2D(tintegral, tx, ty);
anchors[1] = tex2D(tintegral, tx + featurez, ty);
anchors[0] -= anchors[1];
anchors[2] = tex2D(tintegral, tx + featurez * 2, ty);
anchors[1] -= anchors[2];
anchors[2] -= tex2D(tintegral, tx + featurez * 3, ty);
// Grid row 1: same horizontal differences, stored in anchors[3..5].
ty += fh;
anchors[3] = tex2D(tintegral, tx, ty);
anchors[4] = tex2D(tintegral, tx + featurez, ty);
anchors[3] -= anchors[4];
anchors[5] = tex2D(tintegral, tx + featurez * 2, ty);
anchors[4] -= anchors[5];
anchors[5] -= tex2D(tintegral, tx + featurez * 3, ty);
// Row 0 differences minus row 1 differences give the top cell sums.
anchors[0] -= anchors[3];
anchors[1] -= anchors[4];
anchors[2] -= anchors[5];
// anchors[0..2] now contain s0 - s2
// Grid row 2: horizontal differences stored in anchors[6..8].
ty += fh;
anchors[6] = tex2D(tintegral, tx, ty);
anchors[7] = tex2D(tintegral, tx + featurez, ty);
anchors[6] -= anchors[7];
anchors[8] = tex2D(tintegral, tx + featurez * 2, ty);
anchors[7] -= anchors[8];
anchors[8] -= tex2D(tintegral, tx + featurez * 3, ty);
// Row 1 differences minus row 2 differences give the middle cell sums.
anchors[3] -= anchors[6];
anchors[4] -= anchors[7];
anchors[5] -= anchors[8];
// anchors[3..5] now contain s3 - s5 (anchors[4] is the centre cell s4)
// Subtract the centre from the cells that are already final.
anchors[0] -= anchors[4];
anchors[1] -= anchors[4];
anchors[2] -= anchors[4];
anchors[3] -= anchors[4];
anchors[5] -= anchors[4];
// Top row comparisons form the returned code: s0 -> 4, s1 -> 2, s2 -> 1.
int response = (~(anchors[0] >> 31)) & 4;
response |= (~(anchors[1] >> 31)) & 2;;
response |= (~(anchors[2] >> 31)) & 1;
// Remaining comparisons go into `shift`: s5 -> 16, s3 -> 1.
shift = (~(anchors[5] >> 31)) & 16;
shift |= (~(anchors[3] >> 31)) & 1;
// Grid row 3: anchors[0..2] are reused as scratch for the last row's differences.
ty += fh;
anchors[0] = tex2D(tintegral, tx, ty);
anchors[1] = tex2D(tintegral, tx + featurez, ty);
anchors[0] -= anchors[1];
anchors[2] = tex2D(tintegral, tx + featurez * 2, ty);
anchors[1] -= anchors[2];
anchors[2] -= tex2D(tintegral, tx + featurez * 3, ty);
// Row 2 differences minus row 3 differences give the bottom cell sums.
anchors[6] -= anchors[0];
anchors[7] -= anchors[1];
anchors[8] -= anchors[2];
// anchors[6..8] now contain s6 - s8
anchors[6] -= anchors[4];
anchors[7] -= anchors[4];
anchors[8] -= anchors[4];
// Bottom row comparisons complete `shift`: s6 -> 2, s7 -> 4, s8 -> 8.
shift |= (~(anchors[6] >> 31)) & 2;
shift |= (~(anchors[7] >> 31)) & 4;
shift |= (~(anchors[8] >> 31)) & 8;
return response;
}
};
// Attaches the given integral image to the `tintegral` 2D texture reference so
// that device code can sample it with tex2D(). The texture is described as a
// single-channel int surface of integral.cols x integral.rows texels whose rows
// are integral.step apart.
void bindIntegral(DevMem2Di integral)
{
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<int>();

    const size_t texWidth  = static_cast<size_t>(integral.cols);
    const size_t texHeight = static_cast<size_t>(integral.rows);
    const size_t texPitch  = static_cast<size_t>(integral.step);

    cudaSafeCall( cudaBindTexture2D(0, &tintegral, integral.ptr(), &channelDesc, texWidth, texHeight, texPitch) );
}
void unbindIntegral()
{
cudaSafeCall( cudaUnbindTexture(&tintegral));
}
__global__ void lbp_classify_stump(const Stage* stages, const int nstages, const ClNode* nodes, const float* leaves, const int* subsets, const uchar4* features,
const int* integral, const int istep, const int workWidth,const int workHeight, const int clWidth, const int clHeight, const float scale, const int step,
/* const int* integral,const int istep, const int workWidth,const int workHeight,*/ const int clWidth, const int clHeight, const float scale, const int step,
const int subsetSize, DevMem2D_<int4> objects, unsigned int* n)
{
int x = threadIdx.x * step;
@ -63,21 +157,18 @@ namespace cv { namespace gpu { namespace device
{
float sum = 0;
Stage stage = stages[s];
for (int t = 0; t < stage.ntrees; t++)
{
ClNode node = nodes[current_node];
uchar4 feature = features[node.featureIdx];
int c = evaluator( (y + feature.y) * istep + x + feature.x , feature.w * istep, feature.z, integral, istep);
const int* subsetIdx = subsets + (current_node * subsetSize);
int idx = (subsetIdx[c >> 5] & ( 1 << (c & 31))) ? current_leave : current_leave + 1;
int shift;
int c = evaluator(y + feature.y, x + feature.x, feature.w, feature.z, shift);
int idx = (subsets[ current_node * subsetSize + c] & ( 1 << shift)) ? current_leave : current_leave + 1;
sum += leaves[idx];
current_node += 1;
current_leave += 2;
}
if (sum < stage.threshold)
return;
}
@ -85,8 +176,8 @@ namespace cv { namespace gpu { namespace device
int4 rect;
rect.x = roundf(x * scale);
rect.y = roundf(y * scale);
rect.z = roundf(clWidth);
rect.w = roundf(clHeight);
rect.z = clWidth;
rect.w = clHeight;
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
int res = __atomicInc(n, 100U);
#else
@ -178,8 +269,8 @@ namespace cv { namespace gpu { namespace device
}
}
void classifyStump(const DevMem2Db mstages, const int nstages, const DevMem2Di mnodes, const DevMem2Df mleaves, const DevMem2Di msubsets, const DevMem2Db mfeatures,
const DevMem2Di integral, const int workWidth, const int workHeight, const int clWidth, const int clHeight, float scale, int step, int subsetSize,
void classifyStump(const DevMem2Db& mstages, const int nstages, const DevMem2Di& mnodes, const DevMem2Df& mleaves, const DevMem2Di& msubsets, const DevMem2Db& mfeatures,
/*const DevMem2Di& integral,*/ const int workWidth, const int workHeight, const int clWidth, const int clHeight, float scale, int step, int subsetSize,
DevMem2D_<int4> objects, unsigned int* classified)
{
int blocks = ceilf(workHeight / (float)step);
@ -190,11 +281,8 @@ namespace cv { namespace gpu { namespace device
const float* leaves = mleaves.ptr();
const int* subsets = msubsets.ptr();
const uchar4* features = (uchar4*)(mfeatures.ptr());
const int* integ = integral.ptr();
int istep = integral.step / sizeof(int);
lbp_classify_stump<<<blocks, threads>>>(stages, nstages, nodes, leaves, subsets, features, integ, istep,
workWidth, workHeight, clWidth, clHeight, scale, step, subsetSize, objects, classified);
lbp_classify_stump<<<blocks, threads>>>(stages, nstages, nodes, leaves, subsets, features, /*integ, istep,
workWidth, workHeight,*/ clWidth, clHeight, scale, step, subsetSize, objects, classified);
}
int connectedConmonents(DevMem2D_<int4> candidates, DevMem2D_<int4> objects, int groupThreshold, float grouping_eps, unsigned int* nclasses)

View File

@ -153,90 +153,8 @@ __device__ __forceinline__ T __atomicMin(T* address, T val)
__syncthreads();
// printf("tid %d label %d\n", tid, labels[tid]);
}
// Stateless device functor that evaluates one LBP feature directly from an
// integral image stored in linear memory.
struct LBP
{
// The functor has no data members, so the copy constructor has nothing to copy.
__device__ __forceinline__ LBP(const LBP& other) {(void)other;}
__device__ __forceinline__ LBP() {}
// Reads a 4x4 grid of integral values and turns it into the sums of the 3x3
// cells s0..s8 (row-major, s4 is the centre cell); every outer cell is then
// compared against the centre and the results are packed into one 8-bit code.
//   y         - linear element offset of the top-left grid point in `integral`
//   featurew  - offset added to `y` to move down one grid row
//   featurez  - offset between horizontally adjacent grid points
//   integral  - integral image data
//   step      - not read anywhere in this body
//   returns   - s0 -> 128, s1 -> 64, s2 -> 32, s5 -> 16, s8 -> 8, s7 -> 4,
//               s6 -> 2, s3 -> 1
// A bit is set when (cell - centre) is non-negative: `x >> 31` is expected to be
// all ones for a negative x (assumes arithmetic right shift of int), so
// `~(x >> 31) & mask` keeps `mask` only for x >= 0.
// NOTE(review): the previous comment described z,w as a bottom-right corner, but
// this code uses the values purely as offsets between grid points - confirm.
__device__ __forceinline__ int operator() (unsigned int y, int featurew, int featurez, const int* integral, int step) const
{
// Offset of the third grid column; the fourth is featurez + x_off.
int x_off = 2 * featurez;
int anchors[9];
// Grid row 0: anchors[0..2] hold differences of horizontally adjacent samples.
anchors[0] = integral[y];
anchors[1] = integral[y + featurez];
anchors[0] -= anchors[1];
anchors[2] = integral[y + x_off];
anchors[1] -= anchors[2];
anchors[2] -= integral[y + featurez + x_off];
// Grid row 1: same horizontal differences, stored in anchors[3..5].
y += featurew;
anchors[3] = integral[y];
anchors[4] = integral[y + featurez];
anchors[3] -= anchors[4];
anchors[5] = integral[y + x_off];
anchors[4] -= anchors[5];
anchors[5] -= integral[y + featurez + x_off];
// Row 0 differences minus row 1 differences give the top cell sums.
anchors[0] -= anchors[3];
anchors[1] -= anchors[4];
anchors[2] -= anchors[5];
// anchors[0..2] now contain s0 - s2
// Grid row 2: horizontal differences stored in anchors[6..8].
y += featurew;
anchors[6] = integral[y];
anchors[7] = integral[y + featurez];
anchors[6] -= anchors[7];
anchors[8] = integral[y + x_off];
anchors[7] -= anchors[8];
anchors[8] -= integral[y + x_off + featurez];
// Row 1 differences minus row 2 differences give the middle cell sums.
anchors[3] -= anchors[6];
anchors[4] -= anchors[7];
anchors[5] -= anchors[8];
// anchors[3..5] now contain s3 - s5 (anchors[4] is the centre cell s4)
// Subtract the centre from the cells that are already final.
anchors[0] -= anchors[4];
anchors[1] -= anchors[4];
anchors[2] -= anchors[4];
anchors[3] -= anchors[4];
anchors[5] -= anchors[4];
// Pack the comparisons that are available so far.
int response = (~(anchors[0] >> 31)) & 128;
response |= (~(anchors[1] >> 31)) & 64;;
response |= (~(anchors[2] >> 31)) & 32;
response |= (~(anchors[5] >> 31)) & 16;
response |= (~(anchors[3] >> 31)) & 1;
// Grid row 3: anchors[0..2] are reused as scratch for the last row's differences.
y += featurew;
anchors[0] = integral[y];
anchors[1] = integral[y + featurez];
anchors[0] -= anchors[1];
anchors[2] = integral[y + x_off];
anchors[1] -= anchors[2];
anchors[2] -= integral[y + x_off + featurez];
// Row 2 differences minus row 3 differences give the bottom cell sums.
anchors[6] -= anchors[0];
anchors[7] -= anchors[1];
anchors[8] -= anchors[2];
// anchors[6..8] now contain s6 - s8
anchors[6] -= anchors[4];
anchors[7] -= anchors[4];
anchors[8] -= anchors[4];
// Bottom row comparisons fill the remaining bits.
response |= (~(anchors[6] >> 31)) & 2;
response |= (~(anchors[7] >> 31)) & 4;
response |= (~(anchors[8] >> 31)) & 8;
return response;
}
};
} // lbp
} } }// namespaces
#endif

View File

@ -343,15 +343,16 @@ TEST_P(LBP_classify, Accuracy)
cv::gpu::CascadeClassifier_GPU_LBP gpuClassifier;
ASSERT_TRUE(gpuClassifier.load(classifierXmlPath));
cv::gpu::GpuMat gpu_rects, buffer;
cv::gpu::GpuMat gpu_rects;
cv::gpu::GpuMat tested(grey);
int count = gpuClassifier.detectMultiScale(tested, buffer, gpu_rects);
int count = gpuClassifier.detectMultiScale(tested, gpu_rects);
cv::Mat gpu_f(gpu_rects);
int* gpu_faces = (int*)gpu_f.ptr();
for (int i = 0; i < count; i++)
{
cv::Rect r(gpu_faces[i * 4],gpu_faces[i * 4 + 1],gpu_faces[i * 4 + 2],gpu_faces[i * 4 + 3]);
std::cout << gpu_faces[i * 4]<< " " << gpu_faces[i * 4 + 1] << " " << gpu_faces[i * 4 + 2] << " " << gpu_faces[i * 4 + 3] << std::endl;
cv::rectangle(markedImage, r , cv::Scalar(0, 0, 255, 255));
}
}