mirror of
https://github.com/opencv/opencv.git
synced 2024-11-25 11:40:44 +08:00
minor LBP for GPU
This commit is contained in:
parent
5120d690f0
commit
a2430afcac
@ -282,7 +282,7 @@ namespace cv { namespace gpu { namespace device
|
||||
DevMem2D_<int4> objects,
|
||||
unsigned int* classified);
|
||||
|
||||
int connectedConmonents(DevMem2D_<int4> candidates, int ncandidates, DevMem2D_<int4> objects,int groupThreshold, float grouping_eps, unsigned int* nclasses);
|
||||
void connectedConmonents(DevMem2D_<int4> candidates, int ncandidates, DevMem2D_<int4> objects,int groupThreshold, float grouping_eps, unsigned int* nclasses);
|
||||
void bindIntegral(DevMem2Di integral);
|
||||
void unbindIntegral();
|
||||
}
|
||||
@ -294,7 +294,7 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp
|
||||
CV_Assert(!empty() && scaleFactor > 1 && image.depth() == CV_8U);
|
||||
|
||||
const int defaultObjSearchNum = 100;
|
||||
const float grouping_eps = 0.2;
|
||||
const float grouping_eps = 0.2f;
|
||||
|
||||
if( !objects.empty() && objects.depth() == CV_32S)
|
||||
objects.reshape(4, 1);
|
||||
|
@ -216,10 +216,10 @@ namespace cv { namespace gpu { namespace device
|
||||
|
||||
struct Classifier
|
||||
{
|
||||
__host__ __device__ __forceinline__ Classifier(const int* _integral, const int _pitch, const Stage* _stages, const ClNode* _nodes, const float* _leaves, const int* _subsets, const uchar4* _features,
|
||||
const int _nstages, const int _clWidth, const int _clHeight, const float _scale, const int _step, const int _subsetSize)
|
||||
: integral(_integral), pitch(_pitch), stages(_stages), nodes(_nodes), leaves(_leaves), subsets(_subsets), features(_features), nstages(_nstages), clWidth(_clWidth), clHeight(_clHeight),
|
||||
scale(_scale), step(_step), subsetSize(_subsetSize){}
|
||||
__host__ __device__ __forceinline__ Classifier(const int* _integral, int _pitch, const Stage* _stages, const ClNode* _nodes, const float* _leaves,
|
||||
const int* _subsets, const uchar4* _features, int _nstages, int _clWidth, int _clHeight, float _scale, int _step, int _subsetSize)
|
||||
: integral(_integral), pitch(_pitch), stages(_stages), nodes(_nodes), leaves(_leaves), subsets(_subsets), features(_features), nstages(_nstages),
|
||||
clWidth(_clWidth), clHeight(_clHeight), scale(_scale), step(_step), subsetSize(_subsetSize){}
|
||||
|
||||
__device__ __forceinline__ void operator() (int y, int x, DevMem2D_<int4> objects, const unsigned int maxN, unsigned int* n) const
|
||||
{
|
||||
@ -255,7 +255,7 @@ namespace cv { namespace gpu { namespace device
|
||||
rect.z = clWidth;
|
||||
rect.w = clHeight;
|
||||
|
||||
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
|
||||
#if (__CUDA_ARCH__ < 120)
|
||||
int res = __atomicInc(n, maxN);
|
||||
#else
|
||||
int res = atomicInc(n, maxN);
|
||||
@ -305,7 +305,7 @@ namespace cv { namespace gpu { namespace device
|
||||
extern __shared__ int sbuff[];
|
||||
|
||||
int* labels = sbuff;
|
||||
int* rrects = (int*)(sbuff + n);
|
||||
int* rrects = sbuff + n;
|
||||
|
||||
Pr predicate(grouping_eps);
|
||||
partition(candidates, n, labels, predicate);
|
||||
@ -317,7 +317,7 @@ namespace cv { namespace gpu { namespace device
|
||||
__syncthreads();
|
||||
|
||||
int cls = labels[tid];
|
||||
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
|
||||
#if (__CUDA_ARCH__ < 120)
|
||||
__atomicAdd((rrects + cls * 4 + 0), candidates[tid].x);
|
||||
__atomicAdd((rrects + cls * 4 + 1), candidates[tid].y);
|
||||
__atomicAdd((rrects + cls * 4 + 2), candidates[tid].z);
|
||||
@ -332,7 +332,7 @@ namespace cv { namespace gpu { namespace device
|
||||
labels[tid] = 0;
|
||||
|
||||
__syncthreads();
|
||||
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
|
||||
#if (__CUDA_ARCH__ < 120)
|
||||
__atomicInc((unsigned int*)labels + cls, n);
|
||||
#else
|
||||
atomicInc((unsigned int*)labels + cls, n);
|
||||
@ -354,13 +354,10 @@ namespace cv { namespace gpu { namespace device
|
||||
|
||||
if (active && active >= groupThreshold)
|
||||
{
|
||||
int* r1 = rrects + tid * 4;
|
||||
int4 r_out;
|
||||
r_out.x = r1[0];
|
||||
r_out.y = r1[1];
|
||||
r_out.z = r1[2];
|
||||
r_out.w = r1[3];
|
||||
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
|
||||
int* r1 = rrects + tid * 4;
|
||||
int4 r_out = make_int4(r1[0], r1[1], r1[2], r1[3]);
|
||||
|
||||
#if (__CUDA_ARCH__ < 120)
|
||||
objects[__atomicInc(nclasses, n)] = r_out;
|
||||
#else
|
||||
int aidx = atomicInc(nclasses, n);
|
||||
@ -371,21 +368,24 @@ namespace cv { namespace gpu { namespace device
|
||||
|
||||
void classifyStumpFixed(const DevMem2Di& integral, const int pitch, const DevMem2Db& mstages, const int nstages, const DevMem2Di& mnodes, const DevMem2Df& mleaves, const DevMem2Di& msubsets, const DevMem2Db& mfeatures,
|
||||
const int workWidth, const int workHeight, const int clWidth, const int clHeight, float scale, int step, int subsetSize, DevMem2D_<int4> objects, unsigned int* classified)
|
||||
{
|
||||
const int THREADS_BLOCK = 256;
|
||||
int work_amount = ceilf(workHeight / (float)step) * ceilf(workWidth / (float)step);
|
||||
int blocks = divUp(work_amount, THREADS_BLOCK);
|
||||
{
|
||||
Classifier clr(integral, pitch, (Stage*)mstages.ptr(), (ClNode*)mnodes.ptr(), mleaves.ptr(), msubsets,
|
||||
(uchar4*)mfeatures.ptr(), nstages, clWidth, clHeight, scale, step, subsetSize);
|
||||
|
||||
Classifier clr(integral.ptr(), pitch, (Stage*)(mstages.ptr()), (ClNode*)(mnodes.ptr()), mleaves.ptr(), msubsets.ptr(), (uchar4*)(mfeatures.ptr()), nstages, clWidth, clHeight, scale, step, subsetSize);
|
||||
lbp_classify_stump<<<blocks, THREADS_BLOCK>>>(clr, objects, objects.cols, classified, workWidth >> 1);
|
||||
int total = ceilf(workHeight / (float)step) * ceilf(workWidth / (float)step);
|
||||
|
||||
int block = 256;
|
||||
int grid = divUp(total, block);
|
||||
lbp_classify_stump<<<grid, block>>>(clr, objects, objects.cols, classified, workWidth >> 1);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
}
|
||||
|
||||
int connectedConmonents(DevMem2D_<int4> candidates, int ncandidates, DevMem2D_<int4> objects, int groupThreshold, float grouping_eps, unsigned int* nclasses)
|
||||
void connectedConmonents(DevMem2D_<int4> candidates, int ncandidates, DevMem2D_<int4> objects, int groupThreshold, float grouping_eps, unsigned int* nclasses)
|
||||
{
|
||||
int threads = ncandidates;
|
||||
int smem_amount = threads * sizeof(int) + threads * sizeof(int4);
|
||||
disjoin<InSameComponint><<<1, threads, smem_amount>>>((int4*)candidates.ptr(), (int4*)objects.ptr(), ncandidates, groupThreshold, grouping_eps, nclasses);
|
||||
return 0;
|
||||
int block = ncandidates;
|
||||
int smem = block * ( sizeof(int) + sizeof(int4) );
|
||||
disjoin<InSameComponint><<<1, block, smem>>>(candidates, objects, ncandidates, groupThreshold, grouping_eps, nclasses);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
}
|
||||
}
|
||||
}}}
|
@ -49,17 +49,55 @@ namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
struct Emulation
|
||||
{
|
||||
static __forceinline__ __device__ int Ballot(int predicate, volatile int* cta_buffer)
|
||||
template<int CTA_SIZE>
|
||||
static __forceinline__ __device__ int Ballot(int predicate)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 200
|
||||
(void)cta_buffer;
|
||||
#if (__CUDA_ARCH__ >= 200)
|
||||
return __ballot(predicate);
|
||||
#else
|
||||
#else
|
||||
__shared__ volatile int cta_buffer[CTA_SIZE]
|
||||
|
||||
int tid = threadIdx.x;
|
||||
cta_buffer[tid] = predicate ? (1 << (tid & 31)) : 0;
|
||||
return warp_reduce(cta_buffer);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
struct smem
|
||||
{
|
||||
enum { TAG_MASK = (1U << ( (sizeof(unsigned int) << 3) - 5U)) - 1U };
|
||||
|
||||
template<typename T>
|
||||
static __device__ __forceinline__ T atomicInc(T* address, T val)
|
||||
{
|
||||
#if (__CUDA_ARCH__ < 120)
|
||||
|
||||
#else
|
||||
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
static __device__ __forceinline__ void atomicAdd(T* address, T val)
|
||||
{
|
||||
#if (__CUDA_ARCH__ < 120)
|
||||
|
||||
#else
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
__device__ __forceinline__ T __atomicMin(T* address, T val)
|
||||
{
|
||||
#if (__CUDA_ARCH__ < 120)
|
||||
|
||||
#else
|
||||
|
||||
#endif
|
||||
}
|
||||
};
|
||||
};
|
||||
}}} // namespace cv { namespace gpu { namespace device
|
||||
|
||||
|
@ -50,45 +50,46 @@ namespace cv { namespace gpu { namespace device {
|
||||
namespace lbp{
|
||||
|
||||
#define TAG_MASK ( (1U << ( (sizeof(unsigned int) << 3) - 5U)) - 1U )
|
||||
template<typename T>
|
||||
__device__ __forceinline__ T __atomicInc(T* address, T val)
|
||||
{
|
||||
T count;
|
||||
unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
|
||||
do
|
||||
{
|
||||
count = *address & TAG_MASK;
|
||||
count = tag | (count + 1);
|
||||
*address = count;
|
||||
} while (*address != count);
|
||||
|
||||
template<typename T>
|
||||
__device__ __forceinline__ T __atomicInc(T* address, T val)
|
||||
{
|
||||
T count;
|
||||
unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
|
||||
do
|
||||
{
|
||||
count = *address & TAG_MASK;
|
||||
count = tag | (count + 1);
|
||||
*address = count;
|
||||
} while (*address != count);
|
||||
|
||||
return (count & TAG_MASK) - 1;
|
||||
}
|
||||
return (count & TAG_MASK) - 1;
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
__device__ __forceinline__ void __atomicAdd(T* address, T val)
|
||||
{
|
||||
T count;
|
||||
unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
|
||||
do
|
||||
{
|
||||
count = *address & TAG_MASK;
|
||||
count = tag | (count + val);
|
||||
*address = count;
|
||||
} while (*address != count);
|
||||
}
|
||||
template<typename T>
|
||||
__device__ __forceinline__ void __atomicAdd(T* address, T val)
|
||||
{
|
||||
T count;
|
||||
unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
|
||||
do
|
||||
{
|
||||
count = *address & TAG_MASK;
|
||||
count = tag | (count + val);
|
||||
*address = count;
|
||||
} while (*address != count);
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
__device__ __forceinline__ T __atomicMin(T* address, T val)
|
||||
{
|
||||
T count = min(*address, val);
|
||||
do
|
||||
{
|
||||
*address = count;
|
||||
} while (*address > count);
|
||||
template<typename T>
|
||||
__device__ __forceinline__ T __atomicMin(T* address, T val)
|
||||
{
|
||||
T count = min(*address, val);
|
||||
do
|
||||
{
|
||||
*address = count;
|
||||
} while (*address > count);
|
||||
|
||||
return count;
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
struct Stage
|
||||
{
|
||||
@ -112,7 +113,7 @@ __device__ __forceinline__ T __atomicMin(T* address, T val)
|
||||
|
||||
__device__ __forceinline__ bool operator()(const int4& r1, const int4& r2) const
|
||||
{
|
||||
float delta = eps * (min(r1.z, r2.z) + min(r1.w, r2.w)) * 0.5;
|
||||
float delta = eps * (min(r1.z, r2.z) + min(r1.w, r2.w)) * 0.5f;
|
||||
|
||||
return abs(r1.x - r2.x) <= delta && abs(r1.y - r2.y) <= delta
|
||||
&& abs(r1.x + r1.z - r2.x - r2.z) <= delta && abs(r1.y + r1.w - r2.y - r2.w) <= delta;
|
||||
@ -134,22 +135,15 @@ __device__ __forceinline__ T __atomicMin(T* address, T val)
|
||||
int p = labels[tid];
|
||||
int q = labels[id];
|
||||
|
||||
if (p < q)
|
||||
{
|
||||
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
|
||||
__atomicMin(labels + id, p);
|
||||
if (p != q)
|
||||
{
|
||||
int m = min(p, q);
|
||||
#if (__CUDA_ARCH__ < 120)
|
||||
__atomicMin(labels + id, m);
|
||||
#else
|
||||
atomicMin(labels + id, p);
|
||||
atomicMin(labels + id, m);
|
||||
#endif
|
||||
}
|
||||
else if (p > q)
|
||||
{
|
||||
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
|
||||
__atomicMin(labels + tid, q);
|
||||
#else
|
||||
atomicMin(labels + tid, q);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
|
Loading…
Reference in New Issue
Block a user