minor LBP for GPU

Anatoly Baksheev 2012-07-14 16:23:56 +00:00
parent 5120d690f0
commit a2430afcac
4 changed files with 115 additions and 83 deletions


@@ -282,7 +282,7 @@ namespace cv { namespace gpu { namespace device
                             DevMem2D_<int4> objects,
                             unsigned int* classified);
 
-    int connectedConmonents(DevMem2D_<int4> candidates, int ncandidates, DevMem2D_<int4> objects,int groupThreshold, float grouping_eps, unsigned int* nclasses);
+    void connectedConmonents(DevMem2D_<int4> candidates, int ncandidates, DevMem2D_<int4> objects,int groupThreshold, float grouping_eps, unsigned int* nclasses);
     void bindIntegral(DevMem2Di integral);
     void unbindIntegral();
 }
@@ -294,7 +294,7 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp
     CV_Assert(!empty() && scaleFactor > 1 && image.depth() == CV_8U);
 
     const int defaultObjSearchNum = 100;
-    const float grouping_eps = 0.2;
+    const float grouping_eps = 0.2f;
 
     if( !objects.empty() && objects.depth() == CV_32S)
         objects.reshape(4, 1);
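The `0.2` → `0.2f` change (and the matching `0.5` → `0.5f` further down) replaces a double literal with a single-precision one: initializing a `float` from a `double` narrows implicitly and draws a conversion warning on most compilers. A minimal illustration, not taken from the commit:

    float a = 0.2;   // 0.2 is a double literal: implicit narrowing conversion
    float b = 0.2f;  // single-precision literal: exact type match, no warning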


@@ -216,10 +216,10 @@ namespace cv { namespace gpu { namespace device
         struct Classifier
         {
-            __host__ __device__ __forceinline__ Classifier(const int* _integral, const int _pitch, const Stage* _stages, const ClNode* _nodes, const float* _leaves, const int* _subsets, const uchar4* _features,
-                const int _nstages, const int _clWidth, const int _clHeight, const float _scale, const int _step, const int _subsetSize)
-                : integral(_integral), pitch(_pitch), stages(_stages), nodes(_nodes), leaves(_leaves), subsets(_subsets), features(_features), nstages(_nstages), clWidth(_clWidth), clHeight(_clHeight),
-                scale(_scale), step(_step), subsetSize(_subsetSize){}
+            __host__ __device__ __forceinline__ Classifier(const int* _integral, int _pitch, const Stage* _stages, const ClNode* _nodes, const float* _leaves,
+                const int* _subsets, const uchar4* _features, int _nstages, int _clWidth, int _clHeight, float _scale, int _step, int _subsetSize)
+                : integral(_integral), pitch(_pitch), stages(_stages), nodes(_nodes), leaves(_leaves), subsets(_subsets), features(_features), nstages(_nstages),
+                clWidth(_clWidth), clHeight(_clHeight), scale(_scale), step(_step), subsetSize(_subsetSize){}
 
             __device__ __forceinline__ void operator() (int y, int x, DevMem2D_<int4> objects, const unsigned int maxN, unsigned int* n) const
             {
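Besides the reflow, the constructor drops top-level `const` from its by-value parameters (`const int _pitch` becomes `int _pitch`). That is a pure signature tidy-up: top-level `const` on a value parameter is ignored in the function's type, as this sketch (illustration only) shows:

    void f(const int x);  // declares the same function as...
    void f(int x);        // ...this; the const would affect only a definition's body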
@@ -255,7 +255,7 @@ namespace cv { namespace gpu { namespace device
                 rect.z = clWidth;
                 rect.w = clHeight;
 
-#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
+#if (__CUDA_ARCH__ < 120)
                 int res = __atomicInc(n, maxN);
 #else
                 int res = atomicInc(n, maxN);
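The shortened guard relies on two facts: hardware atomics on shared memory require compute capability 1.2, hence the 120 cutoff, and the preprocessor evaluates an undefined identifier in an `#if` expression as 0. During the host compilation pass `__CUDA_ARCH__` is undefined, so the old guard selected the `#else` branch while the new one selects the emulated branch; both merely have to parse there, since the code sits inside `__device__` functions. A sketch of the behavior (comments are mine):

    #if (__CUDA_ARCH__ < 120)           // host pass: undefined -> 0, and 0 < 120 is true
        int res = __atomicInc(n, maxN); // software emulation; also chosen for sm_10/sm_11
    #else
        int res = atomicInc(n, maxN);   // hardware shared-memory atomic, sm_12 and newer
    #endif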
@@ -305,7 +305,7 @@ namespace cv { namespace gpu { namespace device
         extern __shared__ int sbuff[];
 
         int* labels = sbuff;
-        int* rrects = (int*)(sbuff + n);
+        int* rrects = sbuff + n;
 
         Pr predicate(grouping_eps);
         partition(candidates, n, labels, predicate);
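Dropping the `(int*)` cast works because `sbuff + n` is already an `int*`. The dynamic shared block is a single allocation holding the `n` labels followed by the per-class rectangle accumulators, which is exactly the `block * ( sizeof(int) + sizeof(int4) )` bytes requested at launch in `connectedConmonents` below. Layout sketch (mine, names as in the kernel):

    // sbuff: [ labels[0..n) | rrects[0..4*n) ]
    //          n ints         4 ints per class: summed x, y, z, w
    int* labels = sbuff;      // component label per candidate
    int* rrects = sbuff + n;  // rrects + cls * 4 + k = class cls, field k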
@@ -317,7 +317,7 @@ namespace cv { namespace gpu { namespace device
         __syncthreads();
         int cls = labels[tid];
 
-#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
+#if (__CUDA_ARCH__ < 120)
         __atomicAdd((rrects + cls * 4 + 0), candidates[tid].x);
         __atomicAdd((rrects + cls * 4 + 1), candidates[tid].y);
         __atomicAdd((rrects + cls * 4 + 2), candidates[tid].z);
@@ -332,7 +332,7 @@ namespace cv { namespace gpu { namespace device
         labels[tid] = 0;
 
         __syncthreads();
-#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
+#if (__CUDA_ARCH__ < 120)
         __atomicInc((unsigned int*)labels + cls, n);
 #else
         atomicInc((unsigned int*)labels + cls, n);
@@ -354,13 +354,10 @@ namespace cv { namespace gpu { namespace device
         if (active && active >= groupThreshold)
         {
             int* r1 = rrects + tid * 4;
-            int4 r_out;
-            r_out.x = r1[0];
-            r_out.y = r1[1];
-            r_out.z = r1[2];
-            r_out.w = r1[3];
-#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
+            int4 r_out = make_int4(r1[0], r1[1], r1[2], r1[3]);
+
+#if (__CUDA_ARCH__ < 120)
             objects[__atomicInc(nclasses, n)] = r_out;
 #else
             int aidx = atomicInc(nclasses, n);
@@ -371,21 +368,24 @@ namespace cv { namespace gpu { namespace device
     void classifyStumpFixed(const DevMem2Di& integral, const int pitch, const DevMem2Db& mstages, const int nstages, const DevMem2Di& mnodes, const DevMem2Df& mleaves, const DevMem2Di& msubsets, const DevMem2Db& mfeatures,
         const int workWidth, const int workHeight, const int clWidth, const int clHeight, float scale, int step, int subsetSize, DevMem2D_<int4> objects, unsigned int* classified)
     {
-        const int THREADS_BLOCK = 256;
-        int work_amount = ceilf(workHeight / (float)step) * ceilf(workWidth / (float)step);
-        int blocks = divUp(work_amount, THREADS_BLOCK);
-
-        Classifier clr(integral.ptr(), pitch, (Stage*)(mstages.ptr()), (ClNode*)(mnodes.ptr()), mleaves.ptr(), msubsets.ptr(), (uchar4*)(mfeatures.ptr()), nstages, clWidth, clHeight, scale, step, subsetSize);
-        lbp_classify_stump<<<blocks, THREADS_BLOCK>>>(clr, objects, objects.cols, classified, workWidth >> 1);
+        Classifier clr(integral, pitch, (Stage*)mstages.ptr(), (ClNode*)mnodes.ptr(), mleaves.ptr(), msubsets,
+            (uchar4*)mfeatures.ptr(), nstages, clWidth, clHeight, scale, step, subsetSize);
+
+        int total = ceilf(workHeight / (float)step) * ceilf(workWidth / (float)step);
+
+        int block = 256;
+        int grid = divUp(total, block);
+        lbp_classify_stump<<<grid, block>>>(clr, objects, objects.cols, classified, workWidth >> 1);
+        cudaSafeCall( cudaGetLastError() );
     }
 
-    int connectedConmonents(DevMem2D_<int4> candidates, int ncandidates, DevMem2D_<int4> objects, int groupThreshold, float grouping_eps, unsigned int* nclasses)
+    void connectedConmonents(DevMem2D_<int4> candidates, int ncandidates, DevMem2D_<int4> objects, int groupThreshold, float grouping_eps, unsigned int* nclasses)
     {
-        int threads = ncandidates;
-        int smem_amount = threads * sizeof(int) + threads * sizeof(int4);
-        disjoin<InSameComponint><<<1, threads, smem_amount>>>((int4*)candidates.ptr(), (int4*)objects.ptr(), ncandidates, groupThreshold, grouping_eps, nclasses);
-        return 0;
+        int block = ncandidates;
+        int smem = block * ( sizeof(int) + sizeof(int4) );
+        disjoin<InSameComponint><<<1, block, smem>>>(candidates, objects, ncandidates, groupThreshold, grouping_eps, nclasses);
+        cudaSafeCall( cudaGetLastError() );
     }
 }
}}}
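Since `connectedConmonents` now returns `void`, the class count comes back only through the `nclasses` device pointer. A host-side sketch of the launch arithmetic and the readback; `divUp` is the round-up helper used throughout the gpu module, and the `cudaMemcpy` readback is my assumption about how a caller retrieves the count:

    static int divUp(int total, int grain) { return (total + grain - 1) / grain; }

    // grid size for classifyStumpFixed: one thread per stepped (x, y) window position
    int total = ceilf(workHeight / (float)step) * ceilf(workWidth / (float)step);
    int grid  = divUp(total, 256);

    // hypothetical readback of the count written by disjoin:
    unsigned int h_nclasses = 0;
    cudaSafeCall( cudaMemcpy(&h_nclasses, nclasses, sizeof(unsigned int), cudaMemcpyDeviceToHost) );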


@@ -49,17 +49,55 @@ namespace cv { namespace gpu { namespace device
 {
     struct Emulation
     {
-        static __forceinline__ __device__ int Ballot(int predicate, volatile int* cta_buffer)
+        template<int CTA_SIZE>
+        static __forceinline__ __device__ int Ballot(int predicate)
         {
-#if __CUDA_ARCH__ >= 200
-            (void)cta_buffer;
+#if (__CUDA_ARCH__ >= 200)
            return __ballot(predicate);
 #else
+            __shared__ volatile int cta_buffer[CTA_SIZE];
             int tid = threadIdx.x;
             cta_buffer[tid] = predicate ? (1 << (tid & 31)) : 0;
             return warp_reduce(cta_buffer);
 #endif
         }
+
+        struct smem
+        {
+            enum { TAG_MASK = (1U << ( (sizeof(unsigned int) << 3) - 5U)) - 1U };
+
+            template<typename T>
+            static __device__ __forceinline__ T atomicInc(T* address, T val)
+            {
+#if (__CUDA_ARCH__ < 120)
+#else
+#endif
+            }
+
+            template<typename T>
+            static __device__ __forceinline__ void atomicAdd(T* address, T val)
+            {
+#if (__CUDA_ARCH__ < 120)
+#else
+#endif
+            }
+
+            template<typename T>
+            static __device__ __forceinline__ T atomicMin(T* address, T val)
+            {
+#if (__CUDA_ARCH__ < 120)
+#else
+#endif
+            }
+        };
     };
 }}} // namespace cv { namespace gpu { namespace device
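Note that the `Ballot` emulation now allocates its own `__shared__` buffer, sized by the `CTA_SIZE` template parameter, instead of taking one from the caller. The new `Emulation::smem` wrappers, by contrast, are committed with empty bodies. A plausible completion of one of them, assuming they are meant to fold together the tag-based emulation from the `lbp` namespace (last file below) with the native shared-memory atomics available from sm_12; this is a sketch, not what the commit contains:

    template<typename T>
    static __device__ __forceinline__ T atomicInc(T* address, T val)
    {
    #if (__CUDA_ARCH__ < 120)
        // tag trick: stamp the thread index into the top 5 bits and retry
        // until our own write survives the race (see __atomicInc below)
        T count;
        unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
        do
        {
            count = *address & TAG_MASK;
            count = tag | (count + 1);
            *address = count;
        } while (*address != count);
        return (count & TAG_MASK) - 1;
    #else
        return ::atomicInc(address, val);  // hardware atomic; T is unsigned int here
    #endif
    }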


@@ -50,45 +50,46 @@ namespace cv { namespace gpu { namespace device {
 namespace lbp{
 
 #define TAG_MASK ( (1U << ( (sizeof(unsigned int) << 3) - 5U)) - 1U )
+
 template<typename T>
 __device__ __forceinline__ T __atomicInc(T* address, T val)
 {
     T count;
     unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
     do
     {
         count = *address & TAG_MASK;
         count = tag | (count + 1);
         *address = count;
     } while (*address != count);
 
     return (count & TAG_MASK) - 1;
 }
 
 template<typename T>
 __device__ __forceinline__ void __atomicAdd(T* address, T val)
 {
     T count;
     unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
     do
     {
         count = *address & TAG_MASK;
         count = tag | (count + val);
         *address = count;
     } while (*address != count);
 }
 
 template<typename T>
 __device__ __forceinline__ T __atomicMin(T* address, T val)
 {
     T count = min(*address, val);
     do
     {
         *address = count;
     } while (*address > count);
 
     return count;
 }
 
 struct Stage
 {
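The tag trick behind these emulated atomics: pre-sm_12 hardware has no shared-memory atomics, but when several threads of a warp write the same shared word, exactly one write wins. Each thread therefore writes its candidate value stamped with its own thread index in the top 5 bits, re-reads the word, and loops until it sees its own tag; losers observe a foreign tag and retry against the updated count. Worked example (mine), two threads incrementing a counter holding 5:

    // tid 3 and tid 9 both read count = *address & TAG_MASK = 5
    // tid 3 writes (3 << 27) | 6, tid 9 writes (9 << 27) | 6 -- one write wins
    // the loser's while (*address != count) test fails, so it rereads 6 and writes tag | 7
    // final counter: 7 (= 5 + 2, one increment per thread)
    // each thread returns (count & TAG_MASK) - 1, the value before its own increment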
@@ -112,7 +113,7 @@ __device__ __forceinline__ T __atomicMin(T* address, T val)
     __device__ __forceinline__ bool operator()(const int4& r1, const int4& r2) const
     {
-        float delta = eps * (min(r1.z, r2.z) + min(r1.w, r2.w)) * 0.5;
+        float delta = eps * (min(r1.z, r2.z) + min(r1.w, r2.w)) * 0.5f;
         return abs(r1.x - r2.x) <= delta && abs(r1.y - r2.y) <= delta
             && abs(r1.x + r1.z - r2.x - r2.z) <= delta && abs(r1.y + r1.w - r2.y - r2.w) <= delta;
@@ -134,22 +135,15 @@ __device__ __forceinline__ T __atomicMin(T* address, T val)
                 int p = labels[tid];
                 int q = labels[id];
-                if (p < q)
+                if (p != q)
                 {
-#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
-                    __atomicMin(labels + id, p);
+                    int m = min(p, q);
+#if (__CUDA_ARCH__ < 120)
+                    __atomicMin(labels + id, m);
 #else
-                    atomicMin(labels + id, p);
+                    atomicMin(labels + id, m);
 #endif
                 }
-                else if (p > q)
-                {
-#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
-                    __atomicMin(labels + tid, q);
-#else
-                    atomicMin(labels + tid, q);
-#endif
-                }
             }
         }
         __syncthreads();
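The merge now writes `min(p, q)` whenever the two labels differ, instead of branching on which side is smaller. A self-contained host-side sketch of the same min-label propagation over the rectangle-similarity predicate; it mirrors the idea of `disjoin`, not its exact shared-memory algorithm:

    #include <algorithm>
    #include <cstdlib>
    #include <vector>
    #include <vector_types.h>  // int4 (CUDA toolkit header, usable from host code)

    static bool similar(const int4& a, const int4& b, float eps)
    {
        float delta = eps * (std::min(a.z, b.z) + std::min(a.w, b.w)) * 0.5f;
        return std::abs(a.x - b.x) <= delta && std::abs(a.y - b.y) <= delta
            && std::abs(a.x + a.z - b.x - b.z) <= delta && std::abs(a.y + a.w - b.y - b.w) <= delta;
    }

    // Relabel similar rectangles with the smaller label until a fixed point:
    // connected candidates end up sharing one label per group.
    static void minLabelMerge(const std::vector<int4>& rects, std::vector<int>& labels, float eps)
    {
        labels.resize(rects.size());
        for (size_t i = 0; i < rects.size(); ++i) labels[i] = (int)i;
        for (bool changed = true; changed; )
        {
            changed = false;
            for (size_t i = 0; i < rects.size(); ++i)
                for (size_t j = i + 1; j < rects.size(); ++j)
                    if (similar(rects[i], rects[j], eps))
                    {
                        int m = std::min(labels[i], labels[j]);
                        if (labels[i] != m || labels[j] != m)
                        { labels[i] = labels[j] = m; changed = true; }
                    }
        }
    }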