mirror of https://github.com/opencv/opencv.git

commit a2430afcac (parent 5120d690f0)

minor LBP for GPU
@@ -282,7 +282,7 @@ namespace cv { namespace gpu { namespace device
                                 DevMem2D_<int4> objects,
                                 unsigned int* classified);

-        int connectedConmonents(DevMem2D_<int4> candidates, int ncandidates, DevMem2D_<int4> objects,int groupThreshold, float grouping_eps, unsigned int* nclasses);
+        void connectedConmonents(DevMem2D_<int4> candidates, int ncandidates, DevMem2D_<int4> objects,int groupThreshold, float grouping_eps, unsigned int* nclasses);

         void bindIntegral(DevMem2Di integral);
         void unbindIntegral();
     }
@@ -294,7 +294,7 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp
     CV_Assert(!empty() && scaleFactor > 1 && image.depth() == CV_8U);

     const int defaultObjSearchNum = 100;
-    const float grouping_eps = 0.2;
+    const float grouping_eps = 0.2f;

     if( !objects.empty() && objects.depth() == CV_32S)
         objects.reshape(4, 1);
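Note on the 0.2 -> 0.2f change: a plain 0.2 is a double literal, so initializing a float from it goes through an implicit double-to-float narrowing that many compilers warn about. A minimal illustration (not from the commit):

    float a = 0.2;    // double literal, implicitly narrowed to float (typical compiler warning)
    float b = 0.2f;   // float literal, no conversion needed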
@@ -216,10 +216,10 @@ namespace cv { namespace gpu { namespace device

        struct Classifier
        {
-            __host__ __device__ __forceinline__ Classifier(const int* _integral, const int _pitch, const Stage* _stages, const ClNode* _nodes, const float* _leaves, const int* _subsets, const uchar4* _features,
-                const int _nstages, const int _clWidth, const int _clHeight, const float _scale, const int _step, const int _subsetSize)
-            : integral(_integral), pitch(_pitch), stages(_stages), nodes(_nodes), leaves(_leaves), subsets(_subsets), features(_features), nstages(_nstages), clWidth(_clWidth), clHeight(_clHeight),
-              scale(_scale), step(_step), subsetSize(_subsetSize){}
+            __host__ __device__ __forceinline__ Classifier(const int* _integral, int _pitch, const Stage* _stages, const ClNode* _nodes, const float* _leaves,
+                const int* _subsets, const uchar4* _features, int _nstages, int _clWidth, int _clHeight, float _scale, int _step, int _subsetSize)
+            : integral(_integral), pitch(_pitch), stages(_stages), nodes(_nodes), leaves(_leaves), subsets(_subsets), features(_features), nstages(_nstages),
+              clWidth(_clWidth), clHeight(_clHeight), scale(_scale), step(_step), subsetSize(_subsetSize){}

            __device__ __forceinline__ void operator() (int y, int x, DevMem2D_<int4> objects, const unsigned int maxN, unsigned int* n) const
            {
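Note on dropping const from the by-value parameters (_pitch, _nstages, _clWidth, ...): top-level const on a parameter passed by value only affects the function body, not its signature, so callers see no difference. A minimal sketch:

    void f(int x);            // declaration
    void f(const int x) { }   // defines the very same function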
@@ -255,7 +255,7 @@ namespace cv { namespace gpu { namespace device
                rect.z = clWidth;
                rect.w = clHeight;

-#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
+#if (__CUDA_ARCH__ < 120)
                int res = __atomicInc(n, maxN);
#else
                int res = atomicInc(n, maxN);
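The simplified guard works because __CUDA_ARCH__ is only defined while nvcc compiles the device pass; in an #if expression an undefined identifier evaluates to 0, so the host pass also sees the "< 120" branch, which is harmless for device-only code. A sketch of the pattern (hypothetical wrapper name; __atomicInc is the software fallback defined in this file):

    __device__ unsigned int incBounded(unsigned int* counter, unsigned int maxVal)
    {
    #if (__CUDA_ARCH__ < 120)
        return __atomicInc(counter, maxVal);   // emulated path for the oldest architectures
    #else
        return ::atomicInc(counter, maxVal);   // native atomic; this file gates it at SM 1.2
    #endif
    }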
@@ -305,7 +305,7 @@ namespace cv { namespace gpu { namespace device
        extern __shared__ int sbuff[];

        int* labels = sbuff;
-        int* rrects = (int*)(sbuff + n);
+        int* rrects = sbuff + n;

        Pr predicate(grouping_eps);
        partition(candidates, n, labels, predicate);
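sbuff + n is already an int*, so the cast was redundant. The kernel carves one dynamically sized shared buffer into the two arrays it needs; a self-contained sketch of the same layout (hypothetical kernel, n candidates, four ints of rectangle sums per label):

    __global__ void carveSharedExample(int n)
    {
        extern __shared__ int sbuff[];   // size supplied at launch time
        int* labels = sbuff;             // first n ints
        int* rrects = sbuff + n;         // next 4 * n ints, addressed as plain ints

        int tid = threadIdx.x;
        if (tid < n)
            labels[tid] = tid;
        (void)rrects;
    }

    // launch with enough dynamic shared memory for both arrays:
    //   carveSharedExample<<<1, 256, n * sizeof(int) + n * sizeof(int4)>>>(n);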
@@ -317,7 +317,7 @@ namespace cv { namespace gpu { namespace device
        __syncthreads();

        int cls = labels[tid];
-#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
+#if (__CUDA_ARCH__ < 120)
        __atomicAdd((rrects + cls * 4 + 0), candidates[tid].x);
        __atomicAdd((rrects + cls * 4 + 1), candidates[tid].y);
        __atomicAdd((rrects + cls * 4 + 2), candidates[tid].z);
@@ -332,7 +332,7 @@ namespace cv { namespace gpu { namespace device
        labels[tid] = 0;

        __syncthreads();
-#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
+#if (__CUDA_ARCH__ < 120)
        __atomicInc((unsigned int*)labels + cls, n);
#else
        atomicInc((unsigned int*)labels + cls, n);
@@ -354,13 +354,10 @@ namespace cv { namespace gpu { namespace device

        if (active && active >= groupThreshold)
        {
            int* r1 = rrects + tid * 4;
-            int4 r_out;
-            r_out.x = r1[0];
-            r_out.y = r1[1];
-            r_out.z = r1[2];
-            r_out.w = r1[3];
-#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
+            int4 r_out = make_int4(r1[0], r1[1], r1[2], r1[3]);
+
+#if (__CUDA_ARCH__ < 120)
            objects[__atomicInc(nclasses, n)] = r_out;
#else
            int aidx = atomicInc(nclasses, n);
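make_int4 is the CUDA runtime's constructor helper for the built-in int4 vector type, so the four field assignments collapse into one initialization:

    int4 r = make_int4(10, 20, 64, 64);   // equivalent to setting r.x, r.y, r.z and r.w one by one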
@@ -371,21 +368,24 @@ namespace cv { namespace gpu { namespace device

    void classifyStumpFixed(const DevMem2Di& integral, const int pitch, const DevMem2Db& mstages, const int nstages, const DevMem2Di& mnodes, const DevMem2Df& mleaves, const DevMem2Di& msubsets, const DevMem2Db& mfeatures,
                            const int workWidth, const int workHeight, const int clWidth, const int clHeight, float scale, int step, int subsetSize, DevMem2D_<int4> objects, unsigned int* classified)
    {
-        const int THREADS_BLOCK = 256;
-        int work_amount = ceilf(workHeight / (float)step) * ceilf(workWidth / (float)step);
-        int blocks = divUp(work_amount, THREADS_BLOCK);
-
-        Classifier clr(integral.ptr(), pitch, (Stage*)(mstages.ptr()), (ClNode*)(mnodes.ptr()), mleaves.ptr(), msubsets.ptr(), (uchar4*)(mfeatures.ptr()), nstages, clWidth, clHeight, scale, step, subsetSize);
-        lbp_classify_stump<<<blocks, THREADS_BLOCK>>>(clr, objects, objects.cols, classified, workWidth >> 1);
+        Classifier clr(integral, pitch, (Stage*)mstages.ptr(), (ClNode*)mnodes.ptr(), mleaves.ptr(), msubsets,
+                       (uchar4*)mfeatures.ptr(), nstages, clWidth, clHeight, scale, step, subsetSize);
+
+        int total = ceilf(workHeight / (float)step) * ceilf(workWidth / (float)step);
+
+        int block = 256;
+        int grid  = divUp(total, block);
+
+        lbp_classify_stump<<<grid, block>>>(clr, objects, objects.cols, classified, workWidth >> 1);
+
+        cudaSafeCall( cudaGetLastError() );
    }

-    int connectedConmonents(DevMem2D_<int4> candidates, int ncandidates, DevMem2D_<int4> objects, int groupThreshold, float grouping_eps, unsigned int* nclasses)
+    void connectedConmonents(DevMem2D_<int4> candidates, int ncandidates, DevMem2D_<int4> objects, int groupThreshold, float grouping_eps, unsigned int* nclasses)
    {
-        int threads = ncandidates;
-        int smem_amount = threads * sizeof(int) + threads * sizeof(int4);
-        disjoin<InSameComponint><<<1, threads, smem_amount>>>((int4*)candidates.ptr(), (int4*)objects.ptr(), ncandidates, groupThreshold, grouping_eps, nclasses);
-        return 0;
+        int block = ncandidates;
+        int smem  = block * ( sizeof(int) + sizeof(int4) );
+        disjoin<InSameComponint><<<1, block, smem>>>(candidates, objects, ncandidates, groupThreshold, grouping_eps, nclasses);
+        cudaSafeCall( cudaGetLastError() );
    }
}
}}}
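The rewritten launcher first builds the Classifier, then sizes the launch from the number of scan positions: divUp, used throughout OpenCV's GPU module, rounds the block count up so every work item is covered. A minimal sketch of the arithmetic (the numbers are illustrative):

    static inline int divUp(int total, int grain) { return (total + grain - 1) / grain; }

    // e.g. a 640 x 480 work area scanned with step 2:
    //   total = ceil(480 / 2.f) * ceil(640 / 2.f) = 240 * 320 = 76800 positions
    //   block = 256 threads  ->  grid = divUp(76800, 256) = 300 blocks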
@@ -49,17 +49,55 @@ namespace cv { namespace gpu { namespace device
{
    struct Emulation
    {
-        static __forceinline__ __device__ int Ballot(int predicate, volatile int* cta_buffer)
+        template<int CTA_SIZE>
+        static __forceinline__ __device__ int Ballot(int predicate)
        {
-#if __CUDA_ARCH__ >= 200
-            (void)cta_buffer;
+#if (__CUDA_ARCH__ >= 200)
            return __ballot(predicate);
#else
+            __shared__ volatile int cta_buffer[CTA_SIZE]

            int tid = threadIdx.x;
            cta_buffer[tid] = predicate ? (1 << (tid & 31)) : 0;
            return warp_reduce(cta_buffer);
#endif
        }

+        struct smem
+        {
+            enum { TAG_MASK = (1U << ( (sizeof(unsigned int) << 3) - 5U)) - 1U };
+
+            template<typename T>
+            static __device__ __forceinline__ T atomicInc(T* address, T val)
+            {
+#if (__CUDA_ARCH__ < 120)
+
+#else
+
+#endif
+
+            }
+
+            template<typename T>
+            static __device__ __forceinline__ void atomicAdd(T* address, T val)
+            {
+#if (__CUDA_ARCH__ < 120)
+
+#else
+
+#endif
+            }
+
+            template<typename T>
+            __device__ __forceinline__ T __atomicMin(T* address, T val)
+            {
+#if (__CUDA_ARCH__ < 120)
+
+#else
+
+#endif
+            }
+        };
    };
}}} // namespace cv { namespace gpu { namespace device
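On SM 2.0 and newer the Ballot helper maps straight onto the __ballot intrinsic; on older devices each thread publishes one distinct bit in shared memory and the per-lane words are then combined. A self-contained sketch of that fallback (hypothetical names; warp_reduce above is assumed to do the combine, spelled out here as an OR loop):

    __device__ int ballotEmulated(int predicate, volatile int* buf)   // buf: at least 32 ints per warp
    {
        int lane = threadIdx.x & 31;
        buf[lane] = predicate ? (1 << lane) : 0;   // each lane owns a distinct bit

        int mask = 0;
        for (int i = 0; i < 32; ++i)               // bits are disjoint, so OR equals sum
            mask |= buf[i];
        return mask;                               // same result on every lane, like __ballot
    }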
@@ -50,45 +50,46 @@ namespace cv { namespace gpu { namespace device {
namespace lbp{

#define TAG_MASK ( (1U << ( (sizeof(unsigned int) << 3) - 5U)) - 1U )
+
template<typename T>
__device__ __forceinline__ T __atomicInc(T* address, T val)
{
    T count;
    unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
    do
    {
        count = *address & TAG_MASK;
        count = tag | (count + 1);
        *address = count;
    } while (*address != count);
-

    return (count & TAG_MASK) - 1;
}

template<typename T>
__device__ __forceinline__ void __atomicAdd(T* address, T val)
{
    T count;
    unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
    do
    {
        count = *address & TAG_MASK;
        count = tag | (count + val);
        *address = count;
    } while (*address != count);
}

template<typename T>
__device__ __forceinline__ T __atomicMin(T* address, T val)
{
    T count = min(*address, val);
    do
    {
        *address = count;
    } while (*address > count);

    return count;
}

struct Stage
{
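These emulated atomics target hardware without native atomics: every thread stamps the value it writes with its lane id in the top 5 bits of the word and retries until the value it reads back still carries its own tag, meaning its update was the last one to land. An annotated restatement of the increment (hypothetical names, wrap bound ignored as in the original; intended only to disambiguate writers within a warp, as in the shared-memory kernels above):

    #define EMU_TAG_SHIFT ((sizeof(unsigned int) << 3) - 5U)   // 32 - 5 = 27 bits left for the value
    #define EMU_TAG_MASK  ((1U << EMU_TAG_SHIFT) - 1U)

    template <typename T>
    __device__ __forceinline__ T emulatedAtomicInc(T* address, T /*maxVal, unused here*/)
    {
        T count;
        unsigned int tag = threadIdx.x << EMU_TAG_SHIFT;   // lane id in the top 5 bits
        do
        {
            count = *address & EMU_TAG_MASK;  // current value, previous writer's tag stripped
            count = tag | (count + 1);        // incremented value stamped with this lane's id
            *address = count;
        } while (*address != count);          // another lane overwrote us, try again
        return (count & EMU_TAG_MASK) - 1;    // old value, matching hardware atomicInc
    }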
@@ -112,7 +113,7 @@ __device__ __forceinline__ T __atomicMin(T* address, T val)

    __device__ __forceinline__ bool operator()(const int4& r1, const int4& r2) const
    {
-        float delta = eps * (min(r1.z, r2.z) + min(r1.w, r2.w)) * 0.5;
+        float delta = eps * (min(r1.z, r2.z) + min(r1.w, r2.w)) * 0.5f;

        return abs(r1.x - r2.x) <= delta && abs(r1.y - r2.y) <= delta
            && abs(r1.x + r1.z - r2.x - r2.z) <= delta && abs(r1.y + r1.w - r2.y - r2.w) <= delta;
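The predicate treats two candidate rectangles as one detection when their top-left and bottom-right corners differ by at most eps times their average size (the int4 packs x, y = top-left corner, z = width, w = height); the 0.5f suffix keeps the whole expression in single precision. A host-side sketch of the same test:

    #include <algorithm>
    #include <cstdlib>

    struct Rect4 { int x, y, z, w; };   // x, y: top-left corner; z: width; w: height

    bool sameComponent(const Rect4& r1, const Rect4& r2, float eps)
    {
        float delta = eps * (std::min(r1.z, r2.z) + std::min(r1.w, r2.w)) * 0.5f;
        return std::abs(r1.x - r2.x) <= delta && std::abs(r1.y - r2.y) <= delta
            && std::abs(r1.x + r1.z - r2.x - r2.z) <= delta
            && std::abs(r1.y + r1.w - r2.y - r2.w) <= delta;
    }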
@@ -134,22 +135,15 @@ __device__ __forceinline__ T __atomicMin(T* address, T val)
            int p = labels[tid];
            int q = labels[id];

-            if (p < q)
+            if (p != q)
            {
-#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
-                __atomicMin(labels + id, p);
+                int m = min(p, q);
+#if (__CUDA_ARCH__ < 120)
+                __atomicMin(labels + id, m);
#else
-                atomicMin(labels + id, p);
+                atomicMin(labels + id, m);
#endif
            }
-            else if (p > q)
-            {
-#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
-                __atomicMin(labels + tid, q);
-#else
-                atomicMin(labels + tid, q);
-#endif
-            }
        }
    }
    __syncthreads();
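The two symmetric branches collapse into one: whenever the two candidates carry different labels, the smaller label is pushed into labels[id] with the (emulated or native) atomicMin, and repeated sweeps drive every member of a group down to the group's minimum label. A sequential sketch of that merge step (illustrative, not the kernel):

    #include <algorithm>
    #include <vector>

    void mergeStep(std::vector<int>& labels, int tid, int id)
    {
        int p = labels[tid];
        int q = labels[id];
        if (p != q)
            labels[id] = std::min(p, q);   // the kernel does this with atomicMin / __atomicMin
    }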