LBP: multiscale approach; refactored atomics usage
commit e63ab8dec5
parent 5dc7752d54
@@ -1464,6 +1464,7 @@ private:
    GpuMat resuzeBuffer;

    GpuMat candidates;
    static const int integralFactor = 4;
};

////////////////////////////////// SURF //////////////////////////////////////////
@@ -86,7 +86,7 @@ void cv::gpu::CascadeClassifier_GPU_LBP::allocateBuffers(cv::Size frame)
{
    resuzeBuffer.create(frame, CV_8UC1);

    integral.create(frame.height + 1, frame.width + 1, CV_32SC1);
    integral.create(frame.height + 1, integralFactor * (frame.width + 1), CV_32SC1);
    NcvSize32u roiSize;
    roiSize.width = frame.width;
    roiSize.height = frame.height;
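This is the allocation side of the multiscale change: the integral buffer becomes integralFactor times wider than a single (frame.width + 1)-column integral, so the integrals of several downscaled frames can be packed side by side and classified in one kernel launch. A minimal host-side sketch of that packing geometry, with hypothetical names (packIntegrals, levelWidths) and plain cv::Rect arithmetic in place of the GpuMat ROIs used further down:

    #include <opencv2/core/core.hpp>
    #include <vector>

    // Sketch: carve per-level sub-integrals out of one wide buffer, as the
    // detectMultiScale loop below does with integral(cv::Rect(prev, 0, ...)).
    std::vector<cv::Rect> packIntegrals(cv::Size frame, const std::vector<int>& levelWidths,
                                        const std::vector<int>& levelHeights, int integralFactor)
    {
        std::vector<cv::Rect> rois;
        int prev = 0;                                   // running horizontal offset
        const int capacity = integralFactor * (frame.width + 1);
        for (size_t i = 0; i < levelWidths.size(); ++i)
        {
            int w = levelWidths[i] + 1;                 // an integral of a w x h image is (w+1) x (h+1)
            int h = levelHeights[i] + 1;
            if (prev + w > capacity) break;             // buffer full: stop packing
            rois.push_back(cv::Rect(prev, 0, w, h));
            prev += w;                                  // next level starts right after this one
        }
        return rois;
    }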
@@ -284,14 +284,83 @@ namespace cv { namespace gpu { namespace device
                                 DevMem2D_<int4> objects,
                                 unsigned int* classified);

        void classifyPyramid(int frameW,
                             int frameH,
                             int windowW,
                             int windowH,
                             float initialScale,
                             float factor,
                             int total,
                             const DevMem2Db& mstages,
                             const int nstages,
                             const DevMem2Di& mnodes,
                             const DevMem2Df& mleaves,
                             const DevMem2Di& msubsets,
                             const DevMem2Db& mfeatures,
                             const int subsetSize,
                             DevMem2D_<int4> objects,
                             unsigned int* classified,
                             DevMem2Di integral);

        void connectedConmonents(DevMem2D_<int4> candidates, int ncandidates, DevMem2D_<int4> objects, int groupThreshold, float grouping_eps, unsigned int* nclasses);
        void bindIntegral(DevMem2Di integral);
        void unbindIntegral();
    }
}}}
int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, GpuMat& objects,
    double scaleFactor, int groupThreshold, cv::Size maxObjectSize /*, Size minSize=Size()*/)

cv::Size operator -(const cv::Size& a, const cv::Size& b)
{
    return cv::Size(a.width - b.width, a.height - b.height);
}

cv::Size operator +(const cv::Size& a, const int& i)
{
    return cv::Size(a.width + i, a.height + i);
}

cv::Size operator *(const cv::Size& a, const float& f)
{
    return cv::Size(cvRound(a.width * f), cvRound(a.height * f));
}

cv::Size operator /(const cv::Size& a, const float& f)
{
    return cv::Size(cvRound(a.width / f), cvRound(a.height / f));
}

bool operator <=(const cv::Size& a, const cv::Size& b)
{
    return a.width <= b.width && a.height <= b.height;
}

struct PyrLavel
{
    PyrLavel(int _order, float _scale, cv::Size frame, cv::Size window) : order(_order)
    {
        scale = pow(_scale, order);
        sFrame = frame / scale;
        workArea = sFrame - window + 1;
        sWindow = window * scale;
    }

    bool isFeasible(cv::Size maxObj)
    {
        return workArea.width > 0 && workArea.height > 0 && sWindow <= maxObj;
    }

    PyrLavel next(float factor, cv::Size frame, cv::Size window)
    {
        return PyrLavel(order + 1, factor, frame, window);
    }

    int order;
    float scale;
    cv::Size sFrame;
    cv::Size workArea;
    cv::Size sWindow;
};
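Each PyrLavel scales the frame down by scale = _scale^order while the detection window grows by the same factor in original-image coordinates; a level stays feasible while its work area is non-empty and the scaled window still fits maxObjectSize. A small standalone sketch of that enumeration, assuming hypothetical frame/window values and writing the cv::Size arithmetic out instead of using the helper operators above:

    #include <opencv2/core/core.hpp>
    #include <cmath>
    #include <cstdio>

    // Sketch: enumerate pyramid levels the way PyrLavel does, for a 640x480
    // frame and a 24x24 window (values are hypothetical).
    int main()
    {
        cv::Size frame(640, 480), window(24, 24), maxObject = frame;
        const float factor = 1.2f;

        for (int order = 0; ; ++order)
        {
            float scale = std::pow(factor, (float)order);
            cv::Size sFrame(cvRound(frame.width / scale), cvRound(frame.height / scale));          // frame / scale
            cv::Size workArea(sFrame.width - window.width + 1, sFrame.height - window.height + 1); // sFrame - window + 1
            cv::Size sWindow(cvRound(window.width * scale), cvRound(window.height * scale));       // window * scale

            bool feasible = workArea.width > 0 && workArea.height > 0 &&
                            sWindow.width <= maxObject.width && sWindow.height <= maxObject.height;
            if (!feasible)
                break;

            std::printf("level %d: scale %.3f, scaled frame %dx%d, work area %dx%d\n",
                        order, scale, sFrame.width, sFrame.height, workArea.width, workArea.height);
        }
        return 0;
    }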
int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, GpuMat& objects, double scaleFactor, int groupThreshold, cv::Size maxObjectSize)
{
    CV_Assert(!empty() && scaleFactor > 1 && image.depth() == CV_8U);

@@ -306,6 +375,7 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp
    // used for debug
    // candidates.setTo(cv::Scalar::all(0));
    // objects.setTo(cv::Scalar::all(0));

    if (maxObjectSize == cv::Size())
        maxObjectSize = image.size();

@@ -315,52 +385,54 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp
    GpuMat dclassified(1, 1, CV_32S);
    cudaSafeCall( cudaMemcpy(dclassified.ptr(), &classified, sizeof(int), cudaMemcpyHostToDevice) );

    // cv::gpu::device::lbp::bindIntegral(integral);
    PyrLavel level(0, 1.0f, image.size(), NxM);

    Size scaledImageSize(image.cols, image.rows);
    Size processingRectSize( scaledImageSize.width - NxM.width + 1, scaledImageSize.height - NxM.height + 1 );
    Size windowSize(NxM.width, NxM.height);

    float factor = 1;

    for (;;)
    while (level.isFeasible(maxObjectSize))
    {
        if (processingRectSize.width <= 0 || processingRectSize.height <= 0 )
            break;
        int acc = level.sFrame.width + 1;
        float iniScale = level.scale;
        cv::Size area = level.workArea;
        float step = (float)(1 + (level.scale <= 2.f));

        if( windowSize.width > maxObjectSize.width || windowSize.height > maxObjectSize.height )
            break;
        int total = 0, prev = 0;

        // if( windowSize.width < minObjectSize.width || windowSize.height < minObjectSize.height )
        //     continue;
        while (acc <= integralFactor * (image.cols + 1) && level.isFeasible(maxObjectSize))
        {
            // create suitable matrix headers
            GpuMat src = resuzeBuffer(cv::Rect(0, 0, level.sFrame.width, level.sFrame.height));
            GpuMat sint = integral(cv::Rect(prev, 0, level.sFrame.width + 1, level.sFrame.height + 1));
            GpuMat buff = integralBuffer;

            GpuMat scaledImg = resuzeBuffer(cv::Rect(0, 0, scaledImageSize.width, scaledImageSize.height));
            GpuMat scaledIntegral = integral(cv::Rect(0, 0, scaledImageSize.width + 1, scaledImageSize.height + 1));
            GpuMat currBuff = integralBuffer;
            // generate integral for scale
            gpu::resize(image, src, level.sFrame, 0, 0, CV_INTER_LINEAR);
            gpu::integralBuffered(src, sint, buff);

            gpu::resize(image, scaledImg, scaledImageSize, 0, 0, CV_INTER_LINEAR);
            gpu::integralBuffered(scaledImg, scaledIntegral, currBuff);
            total += cvCeil(area.width / step) * cvCeil(area.height / step);
            // std::cout << "Total for scale: " << total << " this step contribution " << cvCeil(area.width / step) * cvCeil(area.height / step) << " previous width shift " << prev << " acc " << acc << " scales: " << cvCeil(area.width / step) << std::endl;

            int step = factor <= 2.f ? 2 : 1;
            // increment pyr level
            level = level.next(scaleFactor, image.size(), NxM);
            area = level.workArea;

            device::lbp::classifyStumpFixed(integral, integral.step1(), stage_mat, stage_mat.cols / sizeof(Stage), nodes_mat, leaves_mat, subsets_mat, features_mat,
                processingRectSize.width, processingRectSize.height, windowSize.width, windowSize.height, factor, step, subsetSize, candidates, dclassified.ptr<unsigned int>());

            factor *= scaleFactor;
            windowSize = cv::Size(cvRound(NxM.width * factor), cvRound(NxM.height * factor));
            scaledImageSize = cv::Size(cvRound( image.cols / factor ), cvRound( image.rows / factor ));
            processingRectSize = cv::Size(scaledImageSize.width - NxM.width + 1, scaledImageSize.height - NxM.height + 1 );
            step = (float)(1 + (level.scale <= 2.f));
            prev = acc;
            acc += level.sFrame.width + 1;
        }

        device::lbp::classifyPyramid(image.cols, image.rows, NxM.width, NxM.height, iniScale, scaleFactor, total, stage_mat, stage_mat.cols / sizeof(Stage), nodes_mat,
            leaves_mat, subsets_mat, features_mat, subsetSize, candidates, dclassified.ptr<unsigned int>(), integral);
    }

    // cv::gpu::device::lbp::unbindIntegral();
    if (groupThreshold <= 0 || objects.empty())
        return 0;

    cudaSafeCall( cudaMemcpy(&classified, dclassified.ptr(), sizeof(int), cudaMemcpyDeviceToHost) );
    device::lbp::connectedConmonents(candidates, classified, objects, groupThreshold, grouping_eps, dclassified.ptr<unsigned int>());

    // candidates.copyTo(objects);
    cudaSafeCall( cudaMemcpy(&classified, dclassified.ptr(), sizeof(int), cudaMemcpyDeviceToHost) );
    cudaSafeCall( cudaDeviceSynchronize() );
    // std::cout << classified << " !!!!!!!!!!" << std::endl;

    return classified;
}
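For context, a rough caller-side sketch of this entry point. It assumes the surrounding CascadeClassifier_GPU_LBP interface (default constructor, load(), a caller-allocated CV_32SC4 result buffer) and uses placeholder file names; only the detectMultiScale signature itself is taken from the code above:

    #include <opencv2/core/core.hpp>
    #include <opencv2/highgui/highgui.hpp>
    #include <opencv2/gpu/gpu.hpp>

    int main()
    {
        cv::gpu::CascadeClassifier_GPU_LBP cascade;
        if (!cascade.load("lbpcascade_frontalface.xml"))     // placeholder model path
            return -1;

        cv::Mat frame = cv::imread("frame.png", 0);          // 8-bit grayscale input
        cv::gpu::GpuMat d_frame(frame);
        cv::gpu::GpuMat d_objects(1, 128, CV_32SC4);         // result buffer (assumed to be caller-allocated)

        // scaleFactor = 1.2, groupThreshold = 4, maxObjectSize = whole frame
        int n = cascade.detectMultiScale(d_frame, d_objects, 1.2, 4, cv::Size());

        if (n > 0)
        {
            cv::Mat h_objects;
            d_objects.colRange(0, n).download(h_objects);    // first n entries are the grouped rectangles
            const cv::Rect* rects = h_objects.ptr<cv::Rect>();
            for (int i = 0; i < n; ++i)
                cv::rectangle(frame, rects[i], cv::Scalar(255));
            cv::imwrite("detections.png", frame);
        }
        return 0;
    }

The returned count is the number of grouped rectangles written to the front of the objects buffer.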
@@ -255,11 +255,7 @@ namespace cv { namespace gpu { namespace device
            rect.z = clWidth;
            rect.w = clHeight;

#if (__CUDA_ARCH__ < 120)
            int res = __atomicInc(n, maxN);
#else
            int res = atomicInc(n, maxN);
#endif
            int res = Emulation::smem::atomicInc(n, maxN);
            objects(0, res) = rect;
        }
@@ -317,26 +313,17 @@ namespace cv { namespace gpu { namespace device
            __syncthreads();

            int cls = labels[tid];
#if (__CUDA_ARCH__ < 120)
            __atomicAdd((rrects + cls * 4 + 0), candidates[tid].x);
            __atomicAdd((rrects + cls * 4 + 1), candidates[tid].y);
            __atomicAdd((rrects + cls * 4 + 2), candidates[tid].z);
            __atomicAdd((rrects + cls * 4 + 3), candidates[tid].w);
#else
            atomicAdd((rrects + cls * 4 + 0), candidates[tid].x);
            atomicAdd((rrects + cls * 4 + 1), candidates[tid].y);
            atomicAdd((rrects + cls * 4 + 2), candidates[tid].z);
            atomicAdd((rrects + cls * 4 + 3), candidates[tid].w);
#endif
            Emulation::smem::atomicAdd((rrects + cls * 4 + 0), candidates[tid].x);
            Emulation::smem::atomicAdd((rrects + cls * 4 + 1), candidates[tid].y);
            Emulation::smem::atomicAdd((rrects + cls * 4 + 2), candidates[tid].z);
            Emulation::smem::atomicAdd((rrects + cls * 4 + 3), candidates[tid].w);

            __syncthreads();
            labels[tid] = 0;

            __syncthreads();
#if (__CUDA_ARCH__ < 120)
            __atomicInc((unsigned int*)labels + cls, n);
#else
            atomicInc((unsigned int*)labels + cls, n);
#endif
            Emulation::smem::atomicInc((unsigned int*)labels + cls, n);

            __syncthreads();
            *nclasses = 0;
@@ -357,12 +344,8 @@ namespace cv { namespace gpu { namespace device
            int* r1 = rrects + tid * 4;
            int4 r_out = make_int4(r1[0], r1[1], r1[2], r1[3]);

#if (__CUDA_ARCH__ < 120)
            objects[__atomicInc(nclasses, n)] = r_out;
#else
            int aidx = atomicInc(nclasses, n);
            int aidx = Emulation::smem::atomicInc(nclasses, n);
            objects[aidx] = r_out;
#endif
        }
    }
@@ -387,5 +370,122 @@ namespace cv { namespace gpu { namespace device
        disjoin<InSameComponint><<<1, block, smem>>>(candidates, objects, ncandidates, groupThreshold, grouping_eps, nclasses);
        cudaSafeCall( cudaGetLastError() );
    }

    struct Cascade
    {
        __host__ __device__ __forceinline__ Cascade(const Stage* _stages, int _nstages, const ClNode* _nodes, const float* _leaves,
            const int* _subsets, const uchar4* _features, int _subsetSize)

        : stages(_stages), nstages(_nstages), nodes(_nodes), leaves(_leaves), subsets(_subsets), features(_features), subsetSize(_subsetSize){}

        __device__ __forceinline__ bool operator() (int y, int x, int* integral, const int pitch/*, DevMem2D_<int4> objects, const unsigned int maxN, unsigned int* n*/) const
        {
            int current_node = 0;
            int current_leave = 0;

            for (int s = 0; s < nstages; ++s)
            {
                float sum = 0;
                Stage stage = stages[s];
                for (int t = 0; t < stage.ntrees; t++)
                {
                    ClNode node = nodes[current_node];
                    uchar4 feature = features[node.featureIdx];

                    int shift;
                    int c = evaluator(integral, (y + feature.y) * pitch + x + feature.x, feature.w * pitch, feature.z, shift);
                    int idx = (subsets[ current_node * subsetSize + c] & ( 1 << shift)) ? current_leave : current_leave + 1;
                    sum += leaves[idx];

                    current_node += 1;
                    current_leave += 2;
                }

                if (sum < stage.threshold)
                    return false;
            }

            return true;
        }

        const Stage* stages;
        const int nstages;

        const ClNode* nodes;
        const float* leaves;
        const int* subsets;
        const uchar4* features;

        const int subsetSize;
        const LBP evaluator;
    };

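The evaluator used above (the LBP functor) is not part of this diff. As an illustration only, this is roughly how an 8-bit local binary pattern code is computed from an integral image and split into the (c, shift) pair consumed by the subset test; the cell geometry and bit order here are assumptions, not the module's actual implementation:

    // Illustration only: sum of a w x h cell of a standard (W+1) x (H+1) integral image.
    static inline int cellSum(const int* integral, int pitch, int x, int y, int w, int h)
    {
        return integral[y * pitch + x]
             + integral[(y + h) * pitch + (x + w)]
             - integral[y * pitch + (x + w)]
             - integral[(y + h) * pitch + x];
    }

    // Compare the 8 border cells of a 3x3 grid with the centre cell to form an
    // 8-bit code, then split it into a 32-bit word index (c) and a bit (shift).
    static inline void lbpCode(const int* integral, int pitch, int x, int y, int cw, int ch,
                               int& c, int& shift)
    {
        int center = cellSum(integral, pitch, x + cw, y + ch, cw, ch);
        static const int dx[8] = { 0, 1, 2, 2, 2, 1, 0, 0 };
        static const int dy[8] = { 0, 0, 0, 1, 2, 2, 2, 1 };

        int code = 0;
        for (int k = 0; k < 8; ++k)
            code = (code << 1) | (cellSum(integral, pitch, x + dx[k] * cw, y + dy[k] * ch, cw, ch) >= center);

        c     = code >> 5;   // which word of the node's subset
        shift = code & 31;   // which bit inside that word
    }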
    // decode flat tid: y = (tid_k / width_k) * stepShift, x = (tid_k % width_k) * stepShift + sum_prev,
    // where sum_prev is the x-offset of this level's integral inside the packed buffer
    __global__ void lbp_cascade(const Cascade cascade, int frameW, int frameH, int windowW, int windowH, float scale, const float factor,
        const int workAmount, int* integral, const int pitch, DevMem2D_<int4> objects, unsigned int* classified)
    {
        int ftid = blockIdx.x * blockDim.x + threadIdx.x;
        if (ftid >= workAmount ) return;

        int sum = 0;
        // float scale = 1.0f;
        float stepShift = (scale <= 2.f) ? 2.0 : 1.0;
        int w = ceilf( ( __float2int_rn(frameW / scale) - windowW + 1) / stepShift);
        int h = ceilf( ( __float2int_rn(frameH / scale) - windowH + 1) / stepShift);

        // if (!ftid)
        //     printf("!!!!: %d %d", w, h);

        int framTid = ftid;
        int i = 0;

        while (1)
        {
            if (framTid < (w - 1) * (h - 1)) break;
            i++;
            sum += __float2int_rn(frameW / scale) + 1;
            framTid -= w * h;
            scale *= factor;
            stepShift = (scale <= 2.f) ? 2.0 : 1.0;
            w = ceilf( ( __float2int_rn(frameW / scale) - windowW + 1) / stepShift);
            h = ceilf( ( __float2int_rn(frameH / scale) - windowH + 1) / stepShift);
        }

        int y = (framTid / w);
        int x = (framTid - y * w) * stepShift;
        y *= stepShift;
        x += sum;

        // if (i == 2)
        //     printf("!!!!!!!!!!!!!! %f %d %d %d\n", windowW * scale, sum, y, x);

        if (cascade(y, x, integral, pitch))
        {
            int4 rect;
            rect.x = roundf( (x - sum) * scale);
            rect.y = roundf(y * scale);
            rect.z = roundf(windowW * scale);
            rect.w = roundf(windowH * scale);

            if (rect.x > frameW || rect.y > frameH) return;
            // printf("OUTLAUER %d %d %d %d %d %d %d %d %d %f %f\n", x, y, ftid, framTid, rect.x, rect.y, sum, w, h, stepShift, scale);

            // printf("passed: %d %d ---- %d %d %d %d %d\n", y, x, rect.x, rect.y, rect.z, rect.w, sum);

            int res = Emulation::smem::atomicInc(classified, (unsigned int)objects.cols);
            objects(0, res) = rect;
        }
    }

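The subtle part of the kernel is decoding the flat work-item index: walk the pyramid levels, subtracting each level's window count, while accumulating the horizontal offset of that level's integral inside the packed buffer. A host-side mirror of that decoding for reasoning about it; the helper below is hypothetical and only approximates the kernel's rounding:

    #include <cmath>

    // Host-side mirror of the flat-index decoding in lbp_cascade above.
    // Illustrative only: floor(x + 0.5) stands in for __float2int_rn, and a
    // small guard is added so the sketch always terminates.
    static void decodeWorkItem(int ftid, int frameW, int frameH, int windowW, int windowH,
                               float scale, float factor, int& x, int& y, float& levelScale)
    {
        int sum = 0;                                              // x-offset of the current level's packed integral
        float stepShift = (scale <= 2.f) ? 2.f : 1.f;
        int w = (int)std::ceil((std::floor(frameW / scale + 0.5f) - windowW + 1) / stepShift);
        int h = (int)std::ceil((std::floor(frameH / scale + 0.5f) - windowH + 1) / stepShift);

        while (w > 1 && h > 1 && ftid >= (w - 1) * (h - 1))       // skip whole levels (guard added for the sketch)
        {
            sum  += (int)std::floor(frameW / scale + 0.5f) + 1;   // width of this level's integral
            ftid -= w * h;
            scale *= factor;
            stepShift = (scale <= 2.f) ? 2.f : 1.f;
            w = (int)std::ceil((std::floor(frameW / scale + 0.5f) - windowW + 1) / stepShift);
            h = (int)std::ceil((std::floor(frameH / scale + 0.5f) - windowH + 1) / stepShift);
        }

        y = (int)((ftid / w) * stepShift);                        // window position in the scaled frame
        x = (int)((ftid - (ftid / w) * w) * stepShift) + sum;     // plus the level's offset inside the wide integral
        levelScale = scale;
    }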
    void classifyPyramid(int frameW, int frameH, int windowW, int windowH, float initialScale, float factor, int workAmount,
        const DevMem2Db& mstages, const int nstages, const DevMem2Di& mnodes, const DevMem2Df& mleaves, const DevMem2Di& msubsets, const DevMem2Db& mfeatures,
        const int subsetSize, DevMem2D_<int4> objects, unsigned int* classified, DevMem2Di integral)
    {
        const int block = 256;
        int grid = divUp(workAmount, block);
        Cascade cascade((Stage*)mstages.ptr(), nstages, (ClNode*)mnodes.ptr(), mleaves.ptr(), msubsets.ptr(), (uchar4*)mfeatures.ptr(), subsetSize);
        lbp_cascade<<<grid, block>>>(cascade, frameW, frameH, windowW, windowH, initialScale, factor, workAmount, integral.ptr(), integral.step / sizeof(int), objects, classified);
    }
}
}}}
@@ -44,6 +44,7 @@
#define OPENCV_GPU_EMULATION_HPP_

#include "warp_reduce.hpp"
#include <stdio.h>

namespace cv { namespace gpu { namespace device
{
@@ -52,10 +53,10 @@ namespace cv { namespace gpu { namespace device
        template<int CTA_SIZE>
        static __forceinline__ __device__ int Ballot(int predicate)
        {
#if (__CUDA_ARCH__ >= 200)
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
            return __ballot(predicate);
#else
            __shared__ volatile int cta_buffer[CTA_SIZE]
            __shared__ volatile int cta_buffer[CTA_SIZE];

            int tid = threadIdx.x;
            cta_buffer[tid] = predicate ? (1 << (tid & 31)) : 0;
@@ -70,31 +71,52 @@ namespace cv { namespace gpu { namespace device
        template<typename T>
        static __device__ __forceinline__ T atomicInc(T* address, T val)
        {
#if (__CUDA_ARCH__ < 120)
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
            T count;
            unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
            do
            {
                count = *address & TAG_MASK;
                count = tag | (count + 1);
                *address = count;
            } while (*address != count);

            return (count & TAG_MASK) - 1;
#else
            return ::atomicInc(address, val);
#endif
        }

        template<typename T>
        static __device__ __forceinline__ void atomicAdd(T* address, T val)
        {
#if (__CUDA_ARCH__ < 120)
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
            T count;
            unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
            do
            {
                count = *address & TAG_MASK;
                count = tag | (count + val);
                *address = count;
            } while (*address != count);
#else
            ::atomicAdd(address, val);
#endif
        }

        template<typename T>
        __device__ __forceinline__ T __atomicMin(T* address, T val)
        static __device__ __forceinline__ T atomicMin(T* address, T val)
        {
#if (__CUDA_ARCH__ < 120)
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
            T count = min(*address, val);
            do
            {
                *address = count;
            } while (*address > count);

            return count;
#else
            return ::atomicMin(address, val);
#endif
        }
    };
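With the helpers above, Emulation::smem gives a single call site for shared-memory atomics: on sm_12 and newer it forwards to the hardware intrinsics, on sm_10/sm_11 it falls back to the tagged-write loops. A toy kernel sketch of the intended usage (hypothetical kernel and buffer names):

    #include <opencv2/gpu/device/emulation.hpp>

    namespace cv { namespace gpu { namespace device
    {
        // Toy kernel: count elements above a threshold with a shared-memory counter.
        __global__ void countAbove(const int* data, int n, int threshold, unsigned int* blockCounts)
        {
            __shared__ unsigned int counter;
            if (threadIdx.x == 0) counter = 0;
            __syncthreads();

            int idx = blockIdx.x * blockDim.x + threadIdx.x;
            if (idx < n && data[idx] > threshold)
                Emulation::smem::atomicInc(&counter, (unsigned int)blockDim.x); // val is the wrap bound used by ::atomicInc on the hardware path

            __syncthreads();
            if (threadIdx.x == 0)
                blockCounts[blockIdx.x] = counter;   // one partial count per block
        }
    }}}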
@@ -44,53 +44,12 @@
#define __OPENCV_GPU_DEVICE_LBP_HPP_

#include "internal_shared.hpp"
#include <opencv2/gpu/device/emulation.hpp>

namespace cv { namespace gpu { namespace device {

namespace lbp {

#define TAG_MASK ( (1U << ( (sizeof(unsigned int) << 3) - 5U)) - 1U )

    template<typename T>
    __device__ __forceinline__ T __atomicInc(T* address, T val)
    {
        T count;
        unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
        do
        {
            count = *address & TAG_MASK;
            count = tag | (count + 1);
            *address = count;
        } while (*address != count);

        return (count & TAG_MASK) - 1;
    }

    template<typename T>
    __device__ __forceinline__ void __atomicAdd(T* address, T val)
    {
        T count;
        unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
        do
        {
            count = *address & TAG_MASK;
            count = tag | (count + val);
            *address = count;
        } while (*address != count);
    }

    template<typename T>
    __device__ __forceinline__ T __atomicMin(T* address, T val)
    {
        T count = min(*address, val);
        do
        {
            *address = count;
        } while (*address > count);

        return count;
    }

    struct Stage
    {
        int first;
@@ -127,27 +86,25 @@ namespace lbp{
        unsigned tid = threadIdx.x;
        labels[tid] = tid;
        __syncthreads();

        for (unsigned int id = 0; id < n; id++)
        {
            if (tid != id && predicate(vec[tid], vec[id]))
            {
                int p = labels[tid];
                int q = labels[id];

                if (p != q)
                if (p < q)
                {
                    int m = min(p, q);
#if (__CUDA_ARCH__ < 120)
                    __atomicMin(labels + id, m);
#else
                    atomicMin(labels + id, m);
#endif
                    Emulation::smem::atomicMin(labels + id, p);
                }
                else if (p > q)
                {
                    Emulation::smem::atomicMin(labels + tid, q);
                }
            }
        }
        __syncthreads();
    }

} // lbp

} } }// namespaces
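The loop above is a block-wide min-label propagation: every candidate starts as its own class, and whenever the similarity predicate holds for a pair, the smaller label wins through the shared-memory atomicMin. A small serial sketch of the same grouping idea, with hypothetical names and a rectangle-similarity predicate modeled on OpenCV's groupRectangles:

    #include <opencv2/core/core.hpp>
    #include <vector>
    #include <algorithm>
    #include <cstdlib>

    // Two rectangles are "similar" when their corners differ by less than a
    // tolerance proportional to their size (the groupRectangles criterion).
    static bool similar(const cv::Rect& a, const cv::Rect& b, float eps)
    {
        float delta = eps * (std::min(a.width, b.width) + std::min(a.height, b.height)) * 0.5f;
        return std::abs(a.x - b.x) <= delta && std::abs(a.y - b.y) <= delta &&
               std::abs(a.x + a.width  - b.x - b.width)  <= delta &&
               std::abs(a.y + a.height - b.y - b.height) <= delta;
    }

    // Serial sketch of the min-label propagation used by disjoin/partition above.
    static std::vector<int> minLabelPropagation(const std::vector<cv::Rect>& vec, float eps)
    {
        std::vector<int> labels(vec.size());
        for (size_t i = 0; i < vec.size(); ++i) labels[i] = (int)i;

        bool changed = true;
        while (changed)                     // iterate to a fixed point (the kernel relies on atomics instead)
        {
            changed = false;
            for (size_t i = 0; i < vec.size(); ++i)
                for (size_t j = 0; j < vec.size(); ++j)
                    if (i != j && similar(vec[i], vec[j], eps))
                    {
                        int m = std::min(labels[i], labels[j]);
                        if (labels[i] != m || labels[j] != m) { labels[i] = labels[j] = m; changed = true; }
                    }
        }
        return labels;                      // rectangles with equal labels form one group
    }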