mirror of https://github.com/opencv/opencv.git
use fast integral for soft cascade
commit df392cc830
parent ac5cd48279
@@ -383,6 +383,89 @@ namespace cv { namespace gpu { namespace device
             if (stream == 0)
                 cudaSafeCall( cudaDeviceSynchronize() );
         }

+        __global__ void shfl_integral_vertical(PtrStepSz<unsigned int> buffer, PtrStepSz<unsigned int> integral)
+        {
+        #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
+            __shared__ unsigned int sums[32][9];
+
+            const int tidx = blockIdx.x * blockDim.x + threadIdx.x;
+            const int lane_id = tidx % 8;
+
+            if (tidx >= integral.cols)
+                return;
+
+            sums[threadIdx.x][threadIdx.y] = 0;
+            __syncthreads();
+
+            unsigned int stepSum = 0;
+
+            for (int y = threadIdx.y; y < integral.rows; y += blockDim.y)
+            {
+                unsigned int* p = buffer.ptr(y) + tidx;
+                unsigned int* dst = integral.ptr(y + 1) + tidx + 1;
+
+                unsigned int sum = *p;
+
+                sums[threadIdx.x][threadIdx.y] = sum;
+                __syncthreads();
+
+                // place into SMEM
+                // shfl scan reduce the SMEM, reformatting so the column
+                // sums are computed in a warp
+                // then read out properly
+                const int j = threadIdx.x % 8;
+                const int k = threadIdx.x / 8 + threadIdx.y * 4;
+
+                int partial_sum = sums[k][j];
+
+                for (int i = 1; i <= 8; i *= 2)
+                {
+                    int n = __shfl_up(partial_sum, i, 32);
+
+                    if (lane_id >= i)
+                        partial_sum += n;
+                }
+
+                sums[k][j] = partial_sum;
+                __syncthreads();
+
+                if (threadIdx.y > 0)
+                    sum += sums[threadIdx.x][threadIdx.y - 1];
+
+                sum += stepSum;
+                stepSum += sums[threadIdx.x][blockDim.y - 1];
+
+                __syncthreads();
+
+                *dst = sum;
+            }
+        #endif
+        }
+
+        // used for frame preprocessing before Soft Cascade evaluation: no synchronization needed
+        // ToDo: partial dy
+        void shfl_integral_gpu_buffered(PtrStepSzb img, PtrStepSz<uint4> buffer, PtrStepSz<unsigned int> integral,
+                                        int blockStep, cudaStream_t stream)
+        {
+            {
+                const int block = blockStep;
+                const int grid = img.rows;
+
+                cudaSafeCall( cudaFuncSetCacheConfig(shfl_integral_horizontal, cudaFuncCachePreferL1) );
+
+                shfl_integral_horizontal<<<grid, block, 0, stream>>>((PtrStepSz<uint4>) img, buffer);
+                cudaSafeCall( cudaGetLastError() );
+            }
+
+            {
+                const dim3 block(32, 8);
+                const dim3 grid(divUp(integral.cols, block.x), 1);
+
+                shfl_integral_vertical<<<grid, block, 0, stream>>>((PtrStepSz<uint>)buffer, integral);
+                cudaSafeCall( cudaGetLastError() );
+            }
+        }
     }
 }}}
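The vertical pass above is a warp-shuffle prefix scan (applied here to 8-lane segments via lane_id = tidx % 8). As a reference point only, not part of this commit, the same scan pattern over a full 32-lane warp boils down to the standalone sketch below; it assumes sm_30+ and uses the pre-CUDA-9 __shfl_up intrinsic that appears in the kernel.

// Minimal sketch of a shuffle-based inclusive warp scan (illustration only).
// On CUDA 9+ toolkits, __shfl_up would be replaced by __shfl_up_sync(0xffffffff, ...).
#include <cstdio>
#include <cuda_runtime.h>

__global__ void warp_inclusive_scan(const int* in, int* out)
{
    const int lane = threadIdx.x & 31;   // lane id within the warp
    int value = in[threadIdx.x];

    // Each step adds the value held 'offset' lanes below; after log2(32)
    // steps every lane holds the inclusive prefix sum of its warp.
    #pragma unroll
    for (int offset = 1; offset < 32; offset *= 2)
    {
        int n = __shfl_up(value, offset, 32);
        if (lane >= offset)
            value += n;
    }

    out[threadIdx.x] = value;
}

int main()
{
    int h_in[32], h_out[32];
    for (int i = 0; i < 32; ++i) h_in[i] = 1;   // expect prefix sums 1..32

    int *d_in, *d_out;
    cudaMalloc(&d_in, sizeof(h_in));
    cudaMalloc(&d_out, sizeof(h_out));
    cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);

    warp_inclusive_scan<<<1, 32>>>(d_in, d_out);
    cudaMemcpy(h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost);

    for (int i = 0; i < 32; ++i) printf("%d ", h_out[i]);
    printf("\n");

    cudaFree(d_in);
    cudaFree(d_out);
    return 0;
}

In shfl_integral_vertical this scan is run per column strip, and stepSum carries the running total from one strip of blockDim.y rows to the next.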
@@ -198,14 +198,14 @@ namespace icf {
         Node node = nodes[nId];

         float threshold = rescale<isUp>(level, node);
-        int sum = get<isUp>(x, y + (node.threshold >> 28) * 121, node.rect);
+        int sum = get<isUp>(x, y + (node.threshold >> 28) * 120, node.rect);

         int next = 1 + (int)(sum >= threshold);
         dprintf("%d: go: %d (%d >= %f)\n\n" ,threadIdx.x, next, sum, threshold);

         node = nodes[nId + next];
         threshold = rescale<isUp>(level, node);
-        sum = get<isUp>(x, y + (node.threshold >> 28) * 121, node.rect);
+        sum = get<isUp>(x, y + (node.threshold >> 28) * 120, node.rect);

         const int lShift = (next - 1) * 2 + (int)(sum >= threshold);
         float impact = leaves[(st + threadIdx.x) * 4 + lShift];
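The functional change in this hunk is the row stride used to pick a feature channel inside the shared integral image: the upper bits of node.threshold appear to select the channel, and with the packed hogluv layout allocated further down a channel band now starts every fh rows instead of fh + 1. A rough sketch of that addressing, with hypothetical names and assuming fh = FRAME_HEIGHT / shr = 120 (this constant is my assumption, not shown in the hunk):

// Illustration only, not code from the commit; names are hypothetical.
#include <cstdio>

// Row inside hogluv at which (channel, local y) lives, channel in [0, HOG_LUV_BINS).
static int hogluvRow(int channel, int y)
{
    // old layout: HOG_LUV_BINS separate per-channel integrals stacked vertically,
    // each occupying fh + 1 = 121 rows:
    //     return y + channel * 121;

    // new layout: one (fh * HOG_LUV_BINS + 1)-row integral over the stacked
    // channels, so a channel band starts every fh = 120 rows:
    return y + channel * 120;
}

int main()
{
    printf("channel 3, local y 10 -> hogluv row %d\n", hogluvRow(3, 10));
    return 0;
}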
@@ -76,14 +76,20 @@ cv::gpu::device::icf::Level::Level(int idx, const Octave& oct, const float scale
 }

 namespace cv { namespace gpu { namespace device {

 namespace icf {
     void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle,
         const int fw, const int fh, const int bins);
 }
-namespace imgproc
-{
-    void shfl_integral_gpu(PtrStepSzb img, PtrStepSz<unsigned int> integral, cudaStream_t stream);
+namespace imgproc {
+    void shfl_integral_gpu_buffered(PtrStepSzb, PtrStepSz<uint4>, PtrStepSz<unsigned int>, int, cudaStream_t);
+
+    template <typename T>
+    void resize_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy,
+        PtrStepSzb dst, int interpolation, cudaStream_t stream);
 }

 }}}

 struct cv::gpu::SoftCascade::Filds
@@ -319,9 +325,13 @@ struct cv::gpu::SoftCascade::Filds
         plane.create(FRAME_HEIGHT * (HOG_LUV_BINS + 1), FRAME_WIDTH, CV_8UC1);
         fplane.create(FRAME_HEIGHT * 6, FRAME_WIDTH, CV_32FC1);
         luv.create(FRAME_HEIGHT, FRAME_WIDTH, CV_8UC3);

         shrunk.create(FRAME_HEIGHT / shr * HOG_LUV_BINS, FRAME_WIDTH / shr, CV_8UC1);
-        integralBuffer.create(1 , (shrunk.rows + 1) * HOG_LUV_BINS * (shrunk.cols + 1), CV_32SC1);
-        hogluv.create((FRAME_HEIGHT / shr + 1) * HOG_LUV_BINS, FRAME_WIDTH / shr + 64, CV_32SC1);
+        integralBuffer.create(shrunk.rows, shrunk.cols, CV_32SC1);
+
+        hogluv.create((FRAME_HEIGHT / shr) * HOG_LUV_BINS + 1, FRAME_WIDTH / shr + 1, CV_32SC1);
+        hogluv.setTo(cv::Scalar::all(0));

         detCounter.create(1,1, CV_32SC1);

         octaves.upload(hoctaves);
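The allocation changes follow the same layout switch: integralBuffer now serves as the intermediate plane of the two-pass shuffle integral rather than scratch space for cv::gpu::integralBuffered, and hogluv becomes a single packed integral over the stacked channels instead of ten padded per-channel integrals. A back-of-the-envelope size comparison under assumed constants (FRAME_WIDTH = 640, FRAME_HEIGHT = 480, shr = 4, HOG_LUV_BINS = 10 are my assumptions, not shown in this hunk):

// Rough size comparison of the two hogluv layouts; illustration only.
#include <cstdio>

int main()
{
    const int FRAME_WIDTH = 640, FRAME_HEIGHT = 480;   // assumed frame size
    const int shr = 4, HOG_LUV_BINS = 10;              // assumed shrinkage and bin count
    const int fw = FRAME_WIDTH / shr;    // 160
    const int fh = FRAME_HEIGHT / shr;   // 120

    // old: ten stacked (fh + 1)-row bands, columns padded to fw + 64
    const long oldRows = (long)(fh + 1) * HOG_LUV_BINS;   // 1210
    const long oldCols = fw + 64;                          // 224

    // new: one packed integral over the stacked channels
    const long newRows = (long)fh * HOG_LUV_BINS + 1;      // 1201
    const long newCols = fw + 1;                           // 161

    printf("old hogluv: %ld x %ld = %ld ints\n", oldRows, oldCols, oldRows * oldCols);
    printf("new hogluv: %ld x %ld = %ld ints\n", newRows, newCols, newRows * newCols);
    return 0;
}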
@@ -432,16 +442,7 @@ private:

         GpuMat channels(plane, cv::Rect(0, 0, fw, fh * Filds::HOG_LUV_BINS));
         cv::gpu::resize(channels, shrunk, cv::Size(), 0.25, 0.25, CV_INTER_AREA);
-
-        fw /= shrinkage;
-        fh /= shrinkage;
-
-        for(int i = 0; i < Filds::HOG_LUV_BINS; ++i)
-        {
-            GpuMat channel(shrunk, cv::Rect(0, fh * i, fw, fh ));
-            GpuMat sum(hogluv, cv::Rect(0, (fh + 1) * i, fw + 1, fh + 1));
-            cv::gpu::integralBuffered(channel, sum, integralBuffer);
-        }
+
+        device::imgproc::shfl_integral_gpu_buffered(shrunk, integralBuffer, hogluv, 12, 0);
     }

 public:
@@ -271,6 +271,7 @@ GPU_TEST_P(SoftCascadeTestAll, detect,
     ASSERT_EQ(detections.cols / sizeof(Detection) ,3670U);
 }

+//ToDo: fix me
 GPU_TEST_P(SoftCascadeTestAll, detectOnIntegral,
     ALL_DEVICES
 )