diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp index 53a5404d87..0954c7a5c2 100644 --- a/modules/gpu/include/opencv2/gpu/gpu.hpp +++ b/modules/gpu/include/opencv2/gpu/gpu.hpp @@ -1464,6 +1464,7 @@ private: GpuMat resuzeBuffer; GpuMat candidates; + static const int integralFactor = 4; }; ////////////////////////////////// SURF ////////////////////////////////////////// diff --git a/modules/gpu/src/cascadeclassifier.cpp b/modules/gpu/src/cascadeclassifier.cpp index 644ce6e5c9..c1ccf61b60 100644 --- a/modules/gpu/src/cascadeclassifier.cpp +++ b/modules/gpu/src/cascadeclassifier.cpp @@ -67,7 +67,7 @@ cv::gpu::CascadeClassifier_GPU_LBP::~CascadeClassifier_GPU_LBP() bool cv::gpu::CascadeClassifier_GPU_LBP::empty() const { throw_nogpu(); return true; } bool cv::gpu::CascadeClassifier_GPU_LBP::load(const string&) { throw_nogpu(); return true; } Size cv::gpu::CascadeClassifier_GPU_LBP::getClassifierSize() const { throw_nogpu(); return Size(); } -void cv::gpu::CascadeClassifier_GPU_LBP::allocateBuffers(cv::Size /*frame*/) { throw_nogpu();} +void cv::gpu::CascadeClassifier_GPU_LBP::allocateBuffers(cv::Size /*frame*/) { throw_nogpu();} int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const cv::gpu::GpuMat& /*image*/, cv::gpu::GpuMat& /*objectsBuf*/, double /*scaleFactor*/, int /*minNeighbors*/, cv::Size /*maxObjectSize*/){ throw_nogpu(); return 0;} @@ -86,7 +86,7 @@ void cv::gpu::CascadeClassifier_GPU_LBP::allocateBuffers(cv::Size frame) { resuzeBuffer.create(frame, CV_8UC1); - integral.create(frame.height + 1, frame.width + 1, CV_32SC1); + integral.create(frame.height + 1, integralFactor * (frame.width + 1), CV_32SC1); NcvSize32u roiSize; roiSize.width = frame.width; roiSize.height = frame.height; @@ -284,14 +284,83 @@ namespace cv { namespace gpu { namespace device DevMem2D_ objects, unsigned int* classified); + void classifyPyramid(int frameW, + int frameH, + int windowW, + int windowH, + float initalScale, + float factor, + int total, + const DevMem2Db& mstages, + const int nstages, + const DevMem2Di& mnodes, + const DevMem2Df& mleaves, + const DevMem2Di& msubsets, + const DevMem2Db& mfeatures, + const int subsetSize, + DevMem2D_ objects, + unsigned int* classified, + DevMem2Di integral); + void connectedConmonents(DevMem2D_ candidates, int ncandidates, DevMem2D_ objects,int groupThreshold, float grouping_eps, unsigned int* nclasses); void bindIntegral(DevMem2Di integral); void unbindIntegral(); } }}} -int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, GpuMat& objects, - double scaleFactor, int groupThreshold, cv::Size maxObjectSize /*, Size minSize=Size()*/) +cv::Size operator -(const cv::Size& a, const cv::Size& b) +{ + return cv::Size(a.width - b.width, a.height - b.height); +} + +cv::Size operator +(const cv::Size& a, const int& i) +{ + return cv::Size(a.width + i, a.height + i); +} + +cv::Size operator *(const cv::Size& a, const float& f) +{ + return cv::Size(cvRound(a.width * f), cvRound(a.height * f)); +} + +cv::Size operator /(const cv::Size& a, const float& f) +{ + return cv::Size(cvRound(a.width / f), cvRound(a.height / f)); +} + +bool operator <=(const cv::Size& a, const cv::Size& b) +{ + return a.width <= b.width && a.height <= b.width; +} + +struct PyrLavel +{ + PyrLavel(int _order, float _scale, cv::Size frame, cv::Size window) : order(_order) + { + scale = pow(_scale, order); + sFrame = frame / scale; + workArea = sFrame - window + 1; + sWindow = window * scale; + } + + bool isFeasible(cv::Size 
maxObj) + { + return workArea.width > 0 && workArea.height > 0 && sWindow <= maxObj; + } + + PyrLavel next(float factor, cv::Size frame, cv::Size window) + { + return PyrLavel(order + 1, factor, frame, window); + } + + int order; + float scale; + cv::Size sFrame; + cv::Size workArea; + cv::Size sWindow; +}; + +int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, GpuMat& objects, double scaleFactor, int groupThreshold, cv::Size maxObjectSize) { CV_Assert(!empty() && scaleFactor > 1 && image.depth() == CV_8U); @@ -306,6 +375,7 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp // used for debug // candidates.setTo(cv::Scalar::all(0)); // objects.setTo(cv::Scalar::all(0)); + if (maxObjectSize == cv::Size()) maxObjectSize = image.size(); @@ -315,52 +385,54 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp GpuMat dclassified(1, 1, CV_32S); cudaSafeCall( cudaMemcpy(dclassified.ptr(), &classified, sizeof(int), cudaMemcpyHostToDevice) ); - // cv::gpu::device::lbp::bindIntegral(integral); + PyrLavel level(0, 1.0f, image.size(), NxM); - Size scaledImageSize(image.cols, image.rows); - Size processingRectSize( scaledImageSize.width - NxM.width + 1, scaledImageSize.height - NxM.height + 1 ); - Size windowSize(NxM.width, NxM.height); - - float factor = 1; - - for (;;) + while (level.isFeasible(maxObjectSize)) { - if (processingRectSize.width <= 0 || processingRectSize.height <= 0 ) - break; + int acc = level.sFrame.width + 1; + float iniScale = level.scale; + cv::Size area = level.workArea; + float step = (float)(1 + (level.scale <= 2.f)); - if( windowSize.width > maxObjectSize.width || windowSize.height > maxObjectSize.height ) - break; + int total = 0, prev = 0; - // if( windowSize.width < minObjectSize.width || windowSize.height < minObjectSize.height ) - // continue; + while (acc <= integralFactor * (image.cols + 1) && level.isFeasible(maxObjectSize)) + { + // create sutable matrix headers + GpuMat src = resuzeBuffer(cv::Rect(0, 0, level.sFrame.width, level.sFrame.height)); + GpuMat sint = integral(cv::Rect(prev, 0, level.sFrame.width + 1, level.sFrame.height + 1)); + GpuMat buff = integralBuffer; - GpuMat scaledImg = resuzeBuffer(cv::Rect(0, 0, scaledImageSize.width, scaledImageSize.height)); - GpuMat scaledIntegral = integral(cv::Rect(0, 0, scaledImageSize.width + 1, scaledImageSize.height + 1)); - GpuMat currBuff = integralBuffer; + // generate integral for scale + gpu::resize(image, src, level.sFrame, 0, 0, CV_INTER_LINEAR); + gpu::integralBuffered(src, sint, buff); - gpu::resize(image, scaledImg, scaledImageSize, 0, 0, CV_INTER_LINEAR); - gpu::integralBuffered(scaledImg, scaledIntegral, currBuff); + total += cvCeil(area.width / step) * cvCeil(area.height / step); + // std::cout << "Total for scale: " << total << " this step contribution " << cvCeil(area.width / step) * cvCeil(area.height / step) << " previous width shift " << prev << " acc " << acc << " scales: " << cvCeil(area.width / step) << std::endl; - int step = factor <= 2.f ? 
2 : 1; + // increment pyr lavel + level = level.next(scaleFactor, image.size(), NxM); + area = level.workArea; - device::lbp::classifyStumpFixed(integral, integral.step1(), stage_mat, stage_mat.cols / sizeof(Stage), nodes_mat, leaves_mat, subsets_mat, features_mat, - processingRectSize.width, processingRectSize.height, windowSize.width, windowSize.height, factor, step, subsetSize, candidates, dclassified.ptr()); + step = (float)(1 + (level.scale <= 2.f)); + prev = acc; + acc += level.sFrame.width + 1; + } - factor *= scaleFactor; - windowSize = cv::Size(cvRound(NxM.width * factor), cvRound(NxM.height * factor)); - scaledImageSize = cv::Size(cvRound( image.cols / factor ), cvRound( image.rows / factor )); - processingRectSize = cv::Size(scaledImageSize.width - NxM.width + 1, scaledImageSize.height - NxM.height + 1 ); + device::lbp::classifyPyramid(image.cols, image.rows, NxM.width, NxM.height, iniScale, scaleFactor, total, stage_mat, stage_mat.cols / sizeof(Stage), nodes_mat, + leaves_mat, subsets_mat, features_mat, subsetSize, candidates, dclassified.ptr(), integral); } - // cv::gpu::device::lbp::unbindIntegral(); if (groupThreshold <= 0 || objects.empty()) return 0; cudaSafeCall( cudaMemcpy(&classified, dclassified.ptr(), sizeof(int), cudaMemcpyDeviceToHost) ); device::lbp::connectedConmonents(candidates, classified, objects, groupThreshold, grouping_eps, dclassified.ptr()); + // candidates.copyTo(objects); cudaSafeCall( cudaMemcpy(&classified, dclassified.ptr(), sizeof(int), cudaMemcpyDeviceToHost) ); cudaSafeCall( cudaDeviceSynchronize() ); + // std::cout << classified << " !!!!!!!!!!" << std::endl; return classified; } diff --git a/modules/gpu/src/cuda/lbp.cu b/modules/gpu/src/cuda/lbp.cu index 42ddd036d2..2667167fbe 100644 --- a/modules/gpu/src/cuda/lbp.cu +++ b/modules/gpu/src/cuda/lbp.cu @@ -216,10 +216,10 @@ namespace cv { namespace gpu { namespace device struct Classifier { - __host__ __device__ __forceinline__ Classifier(const int* _integral, int _pitch, const Stage* _stages, const ClNode* _nodes, const float* _leaves, - const int* _subsets, const uchar4* _features, int _nstages, int _clWidth, int _clHeight, float _scale, int _step, int _subsetSize) - : integral(_integral), pitch(_pitch), stages(_stages), nodes(_nodes), leaves(_leaves), subsets(_subsets), features(_features), nstages(_nstages), - clWidth(_clWidth), clHeight(_clHeight), scale(_scale), step(_step), subsetSize(_subsetSize){} + __host__ __device__ __forceinline__ Classifier(const int* _integral, int _pitch, const Stage* _stages, const ClNode* _nodes, const float* _leaves, + const int* _subsets, const uchar4* _features, int _nstages, int _clWidth, int _clHeight, float _scale, int _step, int _subsetSize) + : integral(_integral), pitch(_pitch), stages(_stages), nodes(_nodes), leaves(_leaves), subsets(_subsets), features(_features), nstages(_nstages), + clWidth(_clWidth), clHeight(_clHeight), scale(_scale), step(_step), subsetSize(_subsetSize){} __device__ __forceinline__ void operator() (int y, int x, DevMem2D_ objects, const unsigned int maxN, unsigned int* n) const { @@ -255,11 +255,7 @@ namespace cv { namespace gpu { namespace device rect.z = clWidth; rect.w = clHeight; -#if (__CUDA_ARCH__ < 120) - int res = __atomicInc(n, maxN); -#else - int res = atomicInc(n, maxN); -#endif + int res = Emulation::smem::atomicInc(n, maxN); objects(0, res) = rect; } @@ -317,26 +313,17 @@ namespace cv { namespace gpu { namespace device __syncthreads(); int cls = labels[tid]; -#if (__CUDA_ARCH__ < 120) - __atomicAdd((rrects + cls * 4 + 
0), candidates[tid].x); - __atomicAdd((rrects + cls * 4 + 1), candidates[tid].y); - __atomicAdd((rrects + cls * 4 + 2), candidates[tid].z); - __atomicAdd((rrects + cls * 4 + 3), candidates[tid].w); -#else - atomicAdd((rrects + cls * 4 + 0), candidates[tid].x); - atomicAdd((rrects + cls * 4 + 1), candidates[tid].y); - atomicAdd((rrects + cls * 4 + 2), candidates[tid].z); - atomicAdd((rrects + cls * 4 + 3), candidates[tid].w); -#endif + Emulation::smem::atomicAdd((rrects + cls * 4 + 0), candidates[tid].x); + Emulation::smem::atomicAdd((rrects + cls * 4 + 1), candidates[tid].y); + Emulation::smem::atomicAdd((rrects + cls * 4 + 2), candidates[tid].z); + Emulation::smem::atomicAdd((rrects + cls * 4 + 3), candidates[tid].w); + __syncthreads(); labels[tid] = 0; __syncthreads(); -#if (__CUDA_ARCH__ < 120) - __atomicInc((unsigned int*)labels + cls, n); -#else - atomicInc((unsigned int*)labels + cls, n); -#endif + Emulation::smem::atomicInc((unsigned int*)labels + cls, n); + __syncthreads(); *nclasses = 0; @@ -354,30 +341,26 @@ namespace cv { namespace gpu { namespace device if (active && active >= groupThreshold) { - int* r1 = rrects + tid * 4; - int4 r_out = make_int4(r1[0], r1[1], r1[2], r1[3]); + int* r1 = rrects + tid * 4; + int4 r_out = make_int4(r1[0], r1[1], r1[2], r1[3]); -#if (__CUDA_ARCH__ < 120) - objects[__atomicInc(nclasses, n)] = r_out; -#else - int aidx = atomicInc(nclasses, n); + int aidx = Emulation::smem::atomicInc(nclasses, n); objects[aidx] = r_out; -#endif } } void classifyStumpFixed(const DevMem2Di& integral, const int pitch, const DevMem2Db& mstages, const int nstages, const DevMem2Di& mnodes, const DevMem2Df& mleaves, const DevMem2Di& msubsets, const DevMem2Db& mfeatures, const int workWidth, const int workHeight, const int clWidth, const int clHeight, float scale, int step, int subsetSize, DevMem2D_ objects, unsigned int* classified) - { - Classifier clr(integral, pitch, (Stage*)mstages.ptr(), (ClNode*)mnodes.ptr(), mleaves.ptr(), msubsets, - (uchar4*)mfeatures.ptr(), nstages, clWidth, clHeight, scale, step, subsetSize); + { + Classifier clr(integral, pitch, (Stage*)mstages.ptr(), (ClNode*)mnodes.ptr(), mleaves.ptr(), msubsets, + (uchar4*)mfeatures.ptr(), nstages, clWidth, clHeight, scale, step, subsetSize); - int total = ceilf(workHeight / (float)step) * ceilf(workWidth / (float)step); + int total = ceilf(workHeight / (float)step) * ceilf(workWidth / (float)step); - int block = 256; + int block = 256; int grid = divUp(total, block); lbp_classify_stump<<>>(clr, objects, objects.cols, classified, workWidth >> 1); - cudaSafeCall( cudaGetLastError() ); + cudaSafeCall( cudaGetLastError() ); } void connectedConmonents(DevMem2D_ candidates, int ncandidates, DevMem2D_ objects, int groupThreshold, float grouping_eps, unsigned int* nclasses) @@ -385,7 +368,124 @@ namespace cv { namespace gpu { namespace device int block = ncandidates; int smem = block * ( sizeof(int) + sizeof(int4) ); disjoin<<<1, block, smem>>>(candidates, objects, ncandidates, groupThreshold, grouping_eps, nclasses); - cudaSafeCall( cudaGetLastError() ); + cudaSafeCall( cudaGetLastError() ); + } + + struct Cascade + { + __host__ __device__ __forceinline__ Cascade(const Stage* _stages, int _nstages, const ClNode* _nodes, const float* _leaves, + const int* _subsets, const uchar4* _features, int _subsetSize) + + : stages(_stages), nstages(_nstages), nodes(_nodes), leaves(_leaves), subsets(_subsets), features(_features), subsetSize(_subsetSize){} + + __device__ __forceinline__ bool operator() (int y, int x, int* 
integral, const int pitch/*, DevMem2D_ objects, const unsigned int maxN, unsigned int* n*/) const + { + int current_node = 0; + int current_leave = 0; + + for (int s = 0; s < nstages; ++s) + { + float sum = 0; + Stage stage = stages[s]; + for (int t = 0; t < stage.ntrees; t++) + { + ClNode node = nodes[current_node]; + uchar4 feature = features[node.featureIdx]; + + int shift; + int c = evaluator(integral, (y + feature.y) * pitch + x + feature.x, feature.w * pitch, feature.z, shift); + int idx = (subsets[ current_node * subsetSize + c] & ( 1 << shift)) ? current_leave : current_leave + 1; + sum += leaves[idx]; + + current_node += 1; + current_leave += 2; + } + + if (sum < stage.threshold) + return false; + } + + return true; + } + + const Stage* stages; + const int nstages; + + const ClNode* nodes; + const float* leaves; + const int* subsets; + const uchar4* features; + + const int subsetSize; + const LBP evaluator; + }; + + // stepShift, scale, width_k, sum_prev => y = sum_prev + tid_k / width_k, x = tid_k - tid_k / width_k + __global__ void lbp_cascade(const Cascade cascade, int frameW, int frameH, int windowW, int windowH, float scale, const float factor, + const int workAmount, int* integral, const int pitch, DevMem2D_ objects, unsigned int* classified) + { + int ftid = blockIdx.x * blockDim.x + threadIdx.x; + if (ftid >= workAmount ) return; + + int sum = 0; + // float scale = 1.0f; + float stepShift = (scale <= 2.f) ? 2.0 : 1.0; + int w = ceilf( ( __float2int_rn(frameW / scale) - windowW + 1) / stepShift); + int h = ceilf( ( __float2int_rn(frameH / scale) - windowH + 1) / stepShift); + + // if (!ftid) + // printf("!!!!: %d %d", w, h); + + int framTid = ftid; + int i = 0; + + while (1) + { + if (framTid < (w - 1) * (h - 1)) break; + i++; + sum += __float2int_rn(frameW / scale) + 1; + framTid -= w * h; + scale *= factor; + stepShift = (scale <= 2.f) ? 2.0 : 1.0; + int w = ceilf( ( __float2int_rn(frameW / scale) - windowW + 1) / stepShift); + int h = ceilf( ( __float2int_rn(frameH / scale) - windowH + 1) / stepShift); + } + + int y = (framTid / w); + int x = (framTid - y * w) * stepShift; + y *= stepShift; + x += sum; + + // if (i == 2) + // printf("!!!!!!!!!!!!!! 
%f %d %d %d\n", windowW * scale, sum, y, x); + + if (cascade(y, x, integral, pitch)) + { + int4 rect; + rect.x = roundf( (x - sum) * scale); + rect.y = roundf(y * scale); + rect.z = roundf(windowW * scale); + rect.w = roundf(windowH * scale); + + if (rect.x > frameW || rect.y > frameH) return; + // printf("OUTLAUER %d %d %d %d %d %d %d %d %d %f %f\n", x, y, ftid, framTid, rect.x, rect.y, sum, w, h, stepShift, scale); + + // printf("passed: %d %d ---- %d %d %d %d %d\n", y, x, rect.x, rect.y, rect.z, rect.w, sum); + + int res = Emulation::smem::atomicInc(classified, (unsigned int)objects.cols); + objects(0, res) = rect; + + } + } + + void classifyPyramid(int frameW, int frameH, int windowW, int windowH, float initialScale, float factor, int workAmount, + const DevMem2Db& mstages, const int nstages, const DevMem2Di& mnodes, const DevMem2Df& mleaves, const DevMem2Di& msubsets, const DevMem2Db& mfeatures, + const int subsetSize, DevMem2D_ objects, unsigned int* classified, DevMem2Di integral) + { + const int block = 256; + int grid = divUp(workAmount, block); + Cascade cascade((Stage*)mstages.ptr(), nstages, (ClNode*)mnodes.ptr(), mleaves.ptr(), msubsets.ptr(), (uchar4*)mfeatures.ptr(), subsetSize); + lbp_cascade<<>>(cascade, frameW, frameH, windowW, windowH, initialScale, factor, workAmount, integral.ptr(), integral.step / sizeof(int), objects, classified); } } }}} \ No newline at end of file diff --git a/modules/gpu/src/opencv2/gpu/device/emulation.hpp b/modules/gpu/src/opencv2/gpu/device/emulation.hpp index f3923a358f..fe5452b5cd 100644 --- a/modules/gpu/src/opencv2/gpu/device/emulation.hpp +++ b/modules/gpu/src/opencv2/gpu/device/emulation.hpp @@ -44,18 +44,19 @@ #define OPENCV_GPU_EMULATION_HPP_ #include "warp_reduce.hpp" +#include namespace cv { namespace gpu { namespace device { struct Emulation { - template + template static __forceinline__ __device__ int Ballot(int predicate) { -#if (__CUDA_ARCH__ >= 200) +#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200) return __ballot(predicate); #else - __shared__ volatile int cta_buffer[CTA_SIZE] + __shared__ volatile int cta_buffer[CTA_SIZE]; int tid = threadIdx.x; cta_buffer[tid] = predicate ? 
(1 << (tid & 31)) : 0; @@ -63,41 +64,62 @@ namespace cv { namespace gpu { namespace device #endif } - struct smem - { - enum { TAG_MASK = (1U << ( (sizeof(unsigned int) << 3) - 5U)) - 1U }; - - template - static __device__ __forceinline__ T atomicInc(T* address, T val) - { -#if (__CUDA_ARCH__ < 120) - -#else - -#endif - - } - - template - static __device__ __forceinline__ void atomicAdd(T* address, T val) - { -#if (__CUDA_ARCH__ < 120) - -#else - -#endif - } - - template - __device__ __forceinline__ T __atomicMin(T* address, T val) - { -#if (__CUDA_ARCH__ < 120) - -#else - -#endif - } - }; + struct smem + { + enum { TAG_MASK = (1U << ( (sizeof(unsigned int) << 3) - 5U)) - 1U }; + + template + static __device__ __forceinline__ T atomicInc(T* address, T val) + { +#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120) + T count; + unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U); + do + { + count = *address & TAG_MASK; + count = tag | (count + 1); + *address = count; + } while (*address != count); + + return (count & TAG_MASK) - 1; +#else + return ::atomicInc(address, val); +#endif + } + + template + static __device__ __forceinline__ void atomicAdd(T* address, T val) + { +#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120) + T count; + unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U); + do + { + count = *address & TAG_MASK; + count = tag | (count + val); + *address = count; + } while (*address != count); +#else + ::atomicAdd(address, val); +#endif + } + + template + static __device__ __forceinline__ T atomicMin(T* address, T val) + { +#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120) + T count = min(*address, val); + do + { + *address = count; + } while (*address > count); + + return count; +#else + return ::atomicMin(address, val); +#endif + } + }; }; }}} // namespace cv { namespace gpu { namespace device diff --git a/modules/gpu/src/opencv2/gpu/device/lbp.hpp b/modules/gpu/src/opencv2/gpu/device/lbp.hpp index 8a7aa0eb92..0c8a03e334 100644 --- a/modules/gpu/src/opencv2/gpu/device/lbp.hpp +++ b/modules/gpu/src/opencv2/gpu/device/lbp.hpp @@ -44,52 +44,11 @@ #define __OPENCV_GPU_DEVICE_LBP_HPP_ #include "internal_shared.hpp" +#include namespace cv { namespace gpu { namespace device { -namespace lbp{ - - #define TAG_MASK ( (1U << ( (sizeof(unsigned int) << 3) - 5U)) - 1U ) - - template - __device__ __forceinline__ T __atomicInc(T* address, T val) - { - T count; - unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U); - do - { - count = *address & TAG_MASK; - count = tag | (count + 1); - *address = count; - } while (*address != count); - - return (count & TAG_MASK) - 1; - } - - template - __device__ __forceinline__ void __atomicAdd(T* address, T val) - { - T count; - unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U); - do - { - count = *address & TAG_MASK; - count = tag | (count + val); - *address = count; - } while (*address != count); - } - - template - __device__ __forceinline__ T __atomicMin(T* address, T val) - { - T count = min(*address, val); - do - { - *address = count; - } while (*address > count); - - return count; - } +namespace lbp { struct Stage { @@ -127,27 +86,25 @@ namespace lbp{ unsigned tid = threadIdx.x; labels[tid] = tid; __syncthreads(); - for (unsigned int id = 0; id < n; id++) { if (tid != id && predicate(vec[tid], vec[id])) { int p = labels[tid]; int q = labels[id]; - - if (p != q) - { - int m = min(p, q); -#if (__CUDA_ARCH__ < 120) - __atomicMin(labels + id, m); -#else - atomicMin(labels + id, 
m); -#endif - } + if (p < q) + { + Emulation::smem::atomicMin(labels + id, p); + } + else if (p > q) + { + Emulation::smem::atomicMin(labels + tid, q); + } } } __syncthreads(); } + } // lbp } } }// namespaces
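A note on the host-side restructuring above: detectMultiScale no longer resizes and classifies one scale per kernel launch. Instead it packs several scaled integral images side by side into the single wide integral buffer (hence the new integralFactor = 4 multiplier on its width), sums the number of detection windows contributed by each packed level, and issues one classifyPyramid launch per batch. The sketch below is a host-only restatement of that bookkeeping under assumed inputs: the PyrLevel name, the 640x480 frame, the 24x24 window and the 1.2 scale factor are illustrative, it packs only the first batch, and it uses std::round rather than cvRound, but the offsets and window counts follow the same arithmetic as the patch's PyrLavel struct and its inner packing loop.

    // Host-only sketch of the integral-packing bookkeeping in detectMultiScale.
    // For each pyramid level it prints the x offset of that level's (w+1)-wide
    // integral image inside the packed buffer and the number of detection
    // windows the level adds to the batched kernel launch.
    #include <cmath>
    #include <cstdio>

    struct Size { int width, height; };

    struct PyrLevel                       // mirrors PyrLavel from the patch
    {
        PyrLevel(int order, float factor, Size frame, Size window)
        {
            scale    = std::pow(factor, order);
            sFrame   = { int(std::round(frame.width  / scale)),
                         int(std::round(frame.height / scale)) };
            workArea = { sFrame.width  - window.width  + 1,
                         sFrame.height - window.height + 1 };
            sWindow  = { int(std::round(window.width  * scale)),
                         int(std::round(window.height * scale)) };
        }

        bool feasible(Size maxObj) const
        {
            return workArea.width > 0 && workArea.height > 0
                && sWindow.width <= maxObj.width && sWindow.height <= maxObj.height;
        }

        float scale;
        Size sFrame, workArea, sWindow;
    };

    int main()
    {
        const Size  frame  = {640, 480};  // input image (illustrative)
        const Size  window = {24, 24};    // classifier window, NxM in the patch
        const float factor = 1.2f;        // scaleFactor
        const int   integralFactor = 4;   // width multiplier of the packed buffer

        int order = 0, acc = 0;           // acc = current x offset in the buffer
        PyrLevel level(order, factor, frame, window);

        // Pack scaled integrals side by side until the wide buffer is full,
        // like the inner while loop of detectMultiScale (first batch only).
        while (acc + level.sFrame.width + 1 <= integralFactor * (frame.width + 1)
               && level.feasible(frame))
        {
            float step    = 1.f + (level.scale <= 2.f); // finer stride at small scales
            int   windows = int(std::ceil(level.workArea.width  / step))
                          * int(std::ceil(level.workArea.height / step));

            std::printf("level %d: scale %.2f, integral at x = %d, %d windows\n",
                        order, level.scale, acc, windows);

            acc  += level.sFrame.width + 1;
            level = PyrLevel(++order, factor, frame, window);
        }
        return 0;
    }

When the wide buffer cannot take the next level, the real code launches classifyPyramid for the batch packed so far and keeps going with the remaining levels, so the launch count drops from one per scale to one per batch of scales.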
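On the device side, lbp_cascade is launched once for the whole batch with a flat work amount, so each thread has to work out which pyramid level it belongs to, where its window sits inside that level, and where that level's integral image starts in the packed buffer. The host-side sketch below restates that index arithmetic under assumptions of mine (the decode name, the sample sizes and the use of std::round are illustrative, and it glosses over the kernel's exact rounding details): walk down the levels subtracting each level's window count from the flat id until it fits, then split the remainder into a row and column using the level's step, and shift the column by the accumulated integral widths.

    // Host-side restatement of the flat-id -> (level, x, y) mapping performed by
    // the lbp_cascade kernel over the packed pyramid of integral images.
    #include <cmath>
    #include <cstdio>

    struct Hit { int level, x, y, integralOffsetX; float scale; };

    Hit decode(int ftid, int frameW, int frameH, int windowW, int windowH,
               float scale, float factor)
    {
        int level = 0;
        int offset = 0;                              // "sum" in the kernel
        for (;;)
        {
            float step    = (scale <= 2.f) ? 2.f : 1.f;  // "stepShift"
            int   scaledW = int(std::round(frameW / scale));
            int   scaledH = int(std::round(frameH / scale));
            int   w = int(std::ceil((scaledW - windowW + 1) / step));
            int   h = int(std::ceil((scaledH - windowH + 1) / step));

            if (w <= 0 || h <= 0)                    // past the last feasible level;
                return { -1, 0, 0, 0, scale };       // the real launch never gets here

            if (ftid < w * h)                        // the id belongs to this level
            {
                int y = ftid / w;
                int x = ftid - y * w;
                return { level, int(x * step) + offset, int(y * step), offset, scale };
            }

            ftid   -= w * h;                         // skip this level's windows
            offset += scaledW + 1;                   // next packed integral starts here
            scale  *= factor;
            ++level;
        }
    }

    int main()
    {
        const int   frameW = 640, frameH = 480, winW = 24, winH = 24;
        const float initialScale = 1.f, factor = 1.2f;

        const int ids[] = {0, 100000, 200000};
        for (int ftid : ids)
        {
            Hit hit = decode(ftid, frameW, frameH, winW, winH, initialScale, factor);
            std::printf("ftid %6d -> level %d (scale %.2f), window (%d, %d), "
                        "integral column offset %d\n",
                        ftid, hit.level, hit.scale,
                        hit.x - hit.integralOffsetX, hit.y, hit.integralOffsetX);
        }
        return 0;
    }

Keeping this mapping purely arithmetic is what lets one kernel launch cover every packed scale without per-level metadata in memory: a passing window is then rescaled back to frame coordinates with roundf(... * scale) and appended to the candidate list via Emulation::smem::atomicInc.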