mirror of
https://github.com/opencv/opencv.git
synced 2025-08-06 06:26:29 +08:00
optimize roi loads
only one thread load roi for all block
This commit is contained in:
parent
fdef0adf95
commit
30bce16ad6
@ -176,33 +176,35 @@ PERF_TEST_P(SoftCascadeTest, detect,
|
|||||||
{
|
{
|
||||||
if (runOnGpu)
|
if (runOnGpu)
|
||||||
{
|
{
|
||||||
cv::Mat cpu = readImage (GetParam().second);
|
cv::Mat cpu = readImage (GET_PARAM(1));
|
||||||
ASSERT_FALSE(cpu.empty());
|
ASSERT_FALSE(cpu.empty());
|
||||||
cv::gpu::GpuMat colored(cpu);
|
cv::gpu::GpuMat colored(cpu);
|
||||||
|
|
||||||
cv::gpu::SoftCascade cascade;
|
cv::gpu::SoftCascade cascade;
|
||||||
ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath(GetParam().first)));
|
ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath(GET_PARAM(0))));
|
||||||
|
|
||||||
cv::gpu::GpuMat objectBoxes(1, 16384, CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1);
|
cv::gpu::GpuMat objectBoxes(1, 16384, CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1), trois;
|
||||||
|
rois.setTo(1);
|
||||||
rois.setTo(0);
|
cv::gpu::transpose(rois, trois);
|
||||||
cv::gpu::GpuMat sub(rois, cv::Rect(rois.cols / 4, rois.rows / 4,rois.cols / 2, rois.rows / 2));
|
cascade.detectMultiScale(colored, trois, objectBoxes);
|
||||||
sub.setTo(cv::Scalar::all(1));
|
|
||||||
cascade.detectMultiScale(colored, rois, objectBoxes);
|
|
||||||
|
|
||||||
TEST_CYCLE()
|
TEST_CYCLE()
|
||||||
{
|
{
|
||||||
cascade.detectMultiScale(colored, rois, objectBoxes);
|
cascade.detectMultiScale(colored, trois, objectBoxes);
|
||||||
}
|
}
|
||||||
} else
|
}
|
||||||
|
else
|
||||||
{
|
{
|
||||||
cv::Mat colored = readImage(GetParam().second);
|
cv::Mat colored = readImage(GET_PARAM(1));
|
||||||
ASSERT_FALSE(colored.empty());
|
ASSERT_FALSE(colored.empty());
|
||||||
|
|
||||||
cv::SoftCascade cascade;
|
cv::SoftCascade cascade;
|
||||||
ASSERT_TRUE(cascade.load(getDataPath(GetParam().first)));
|
ASSERT_TRUE(cascade.load(getDataPath(GET_PARAM(0))));
|
||||||
|
|
||||||
std::vector<cv::Rect> rois, objectBoxes;
|
std::vector<cv::Rect> rois;
|
||||||
|
|
||||||
|
typedef cv::SoftCascade::Detection Detection;
|
||||||
|
std::vector<Detection>objectBoxes;
|
||||||
cascade.detectMultiScale(colored, rois, objectBoxes);
|
cascade.detectMultiScale(colored, rois, objectBoxes);
|
||||||
|
|
||||||
TEST_CYCLE()
|
TEST_CYCLE()
|
||||||
@ -262,13 +264,16 @@ PERF_TEST_P(SoftCascadeTestRoi, detectInRoi,
|
|||||||
sub.setTo(1);
|
sub.setTo(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
cv::gpu::GpuMat trois;
|
||||||
|
cv::gpu::transpose(rois, trois);
|
||||||
|
|
||||||
cv::gpu::GpuMat curr = objectBoxes;
|
cv::gpu::GpuMat curr = objectBoxes;
|
||||||
cascade.detectMultiScale(colored, rois, curr);
|
cascade.detectMultiScale(colored, trois, curr);
|
||||||
|
|
||||||
TEST_CYCLE()
|
TEST_CYCLE()
|
||||||
{
|
{
|
||||||
curr = objectBoxes;
|
curr = objectBoxes;
|
||||||
cascade.detectMultiScale(colored, rois, curr);
|
cascade.detectMultiScale(colored, trois, curr);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
@ -301,7 +306,10 @@ PERF_TEST_P(SoftCascadeTestRoi, detectEachRoi,
|
|||||||
sub.setTo(1);
|
sub.setTo(1);
|
||||||
|
|
||||||
cv::gpu::GpuMat curr = objectBoxes;
|
cv::gpu::GpuMat curr = objectBoxes;
|
||||||
cascade.detectMultiScale(colored, rois, curr);
|
cv::gpu::GpuMat trois;
|
||||||
|
cv::gpu::transpose(rois, trois);
|
||||||
|
|
||||||
|
cascade.detectMultiScale(colored, trois, curr);
|
||||||
|
|
||||||
TEST_CYCLE()
|
TEST_CYCLE()
|
||||||
{
|
{
|
||||||
@ -372,7 +380,7 @@ PERF_TEST_P(ImageAndCascade, ObjDetect_LBPClassifier,
|
|||||||
cv::Mat img = readImage(GetParam().first, cv::IMREAD_GRAYSCALE);
|
cv::Mat img = readImage(GetParam().first, cv::IMREAD_GRAYSCALE);
|
||||||
ASSERT_FALSE(img.empty());
|
ASSERT_FALSE(img.empty());
|
||||||
|
|
||||||
if (PERF_RUN_GPU())
|
if (runOnGpu)
|
||||||
{
|
{
|
||||||
cv::gpu::CascadeClassifier_GPU d_cascade;
|
cv::gpu::CascadeClassifier_GPU d_cascade;
|
||||||
ASSERT_TRUE(d_cascade.load(perf::TestBase::getDataPath(GetParam().second)));
|
ASSERT_TRUE(d_cascade.load(perf::TestBase::getDataPath(GetParam().second)));
|
||||||
|
@ -86,7 +86,6 @@ namespace icf {
|
|||||||
}
|
}
|
||||||
|
|
||||||
texture<int, cudaTextureType2D, cudaReadModeElementType> thogluv;
|
texture<int, cudaTextureType2D, cudaReadModeElementType> thogluv;
|
||||||
texture<char, cudaTextureType2D, cudaReadModeElementType> troi;
|
|
||||||
|
|
||||||
template<bool isUp>
|
template<bool isUp>
|
||||||
__device__ __forceinline__ float rescale(const Level& level, Node& node)
|
__device__ __forceinline__ float rescale(const Level& level, Node& node)
|
||||||
@ -130,11 +129,6 @@ namespace icf {
|
|||||||
float relScale = level.relScale;
|
float relScale = level.relScale;
|
||||||
float farea = scaledRect.z * scaledRect.w;
|
float farea = scaledRect.z * scaledRect.w;
|
||||||
|
|
||||||
dprintf("%d: feature %d box %d %d %d %d\n",threadIdx.x, (node.threshold >> 28), scaledRect.x, scaledRect.y,
|
|
||||||
scaledRect.z, scaledRect.w);
|
|
||||||
dprintf("%d: rescale: %f [%f %f] selected %f\n",threadIdx.x, level.relScale, level.scaling[0], level.scaling[1],
|
|
||||||
level.scaling[(node.threshold >> 28) > 6]);
|
|
||||||
|
|
||||||
// rescale
|
// rescale
|
||||||
scaledRect.x = __float2int_rn(relScale * scaledRect.x);
|
scaledRect.x = __float2int_rn(relScale * scaledRect.x);
|
||||||
scaledRect.y = __float2int_rn(relScale * scaledRect.y);
|
scaledRect.y = __float2int_rn(relScale * scaledRect.y);
|
||||||
@ -146,15 +140,7 @@ namespace icf {
|
|||||||
const float expected_new_area = farea * relScale * relScale;
|
const float expected_new_area = farea * relScale * relScale;
|
||||||
float approx = __fdividef(sarea, expected_new_area);
|
float approx = __fdividef(sarea, expected_new_area);
|
||||||
|
|
||||||
dprintf("%d: new rect: %d box %d %d %d %d rel areas %f %f\n",threadIdx.x, (node.threshold >> 28),
|
float rootThreshold = (node.threshold & 0x0FFFFFFFU) * approx * level.scaling[(node.threshold >> 28) > 6];
|
||||||
scaledRect.x, scaledRect.y, scaledRect.z, scaledRect.w, farea * relScale * relScale, sarea);
|
|
||||||
|
|
||||||
float rootThreshold = (node.threshold & 0x0FFFFFFFU) * approx;
|
|
||||||
|
|
||||||
rootThreshold *= level.scaling[(node.threshold >> 28) > 6];
|
|
||||||
|
|
||||||
dprintf("%d: approximation %f %d -> %f %f\n",threadIdx.x, approx, (node.threshold & 0x0FFFFFFFU), rootThreshold,
|
|
||||||
level.scaling[(node.threshold >> 28) > 6]);
|
|
||||||
|
|
||||||
return rootThreshold;
|
return rootThreshold;
|
||||||
}
|
}
|
||||||
@ -162,33 +148,17 @@ namespace icf {
|
|||||||
template<bool isUp>
|
template<bool isUp>
|
||||||
__device__ __forceinline__ int get(int x, int y, uchar4 area)
|
__device__ __forceinline__ int get(int x, int y, uchar4 area)
|
||||||
{
|
{
|
||||||
|
|
||||||
dprintf("%d: feature box %d %d %d %d\n",threadIdx.x, area.x, area.y, area.z, area.w);
|
|
||||||
dprintf("%d: extract feature for: [%d %d] [%d %d] [%d %d] [%d %d]\n",threadIdx.x,
|
|
||||||
x + area.x, y + area.y, x + area.z, y + area.y, x + area.z,y + area.w,
|
|
||||||
x + area.x, y + area.w);
|
|
||||||
dprintf("%d: at point %d %d with offset %d\n", x, y, 0);
|
|
||||||
|
|
||||||
int a = tex2D(thogluv, x + area.x, y + area.y);
|
int a = tex2D(thogluv, x + area.x, y + area.y);
|
||||||
int b = tex2D(thogluv, x + area.z, y + area.y);
|
int b = tex2D(thogluv, x + area.z, y + area.y);
|
||||||
int c = tex2D(thogluv, x + area.z, y + area.w);
|
int c = tex2D(thogluv, x + area.z, y + area.w);
|
||||||
int d = tex2D(thogluv, x + area.x, y + area.w);
|
int d = tex2D(thogluv, x + area.x, y + area.w);
|
||||||
|
|
||||||
dprintf("%d retruved integral values: %d %d %d %d\n",threadIdx.x, a, b, c, d);
|
|
||||||
|
|
||||||
return (a - b + c - d);
|
return (a - b + c - d);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template<>
|
||||||
__device__ __forceinline__ int get<true>(int x, int y, uchar4 area)
|
__device__ __forceinline__ int get<true>(int x, int y, uchar4 area)
|
||||||
{
|
{
|
||||||
|
|
||||||
dprintf("%d: feature box %d %d %d %d\n",threadIdx.x, area.x, area.y, area.z, area.w);
|
|
||||||
dprintf("%d: extract feature for: [%d %d] [%d %d] [%d %d] [%d %d]\n",threadIdx.x,
|
|
||||||
x + area.x, y + area.y, x + area.z, y + area.y, x + area.z,y + area.w,
|
|
||||||
x + area.x, y + area.w);
|
|
||||||
dprintf("%d: at point %d %d with offset %d\n", x, y, 0);
|
|
||||||
|
|
||||||
x += area.x;
|
x += area.x;
|
||||||
y += area.y;
|
y += area.y;
|
||||||
int a = tex2D(thogluv, x, y);
|
int a = tex2D(thogluv, x, y);
|
||||||
@ -196,11 +166,10 @@ namespace icf {
|
|||||||
int c = tex2D(thogluv, x + area.z, y + area.w);
|
int c = tex2D(thogluv, x + area.z, y + area.w);
|
||||||
int d = tex2D(thogluv, x, y + area.w);
|
int d = tex2D(thogluv, x, y + area.w);
|
||||||
|
|
||||||
dprintf("%d retruved integral values: %d %d %d %d\n",threadIdx.x, a, b, c, d);
|
|
||||||
|
|
||||||
return (a - b + c - d);
|
return (a - b + c - d);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
texture<float2, cudaTextureType2D, cudaReadModeElementType> troi;
|
||||||
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
|
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
|
||||||
template<bool isUp>
|
template<bool isUp>
|
||||||
__global__ void test_kernel_warp(const Level* levels, const Octave* octaves, const float* stages,
|
__global__ void test_kernel_warp(const Level* levels, const Octave* octaves, const float* stages,
|
||||||
@ -210,12 +179,21 @@ namespace icf {
|
|||||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||||
const int x = blockIdx.x;
|
const int x = blockIdx.x;
|
||||||
|
|
||||||
|
__shared__ volatile char roiCache[8];
|
||||||
|
|
||||||
|
if (!threadIdx.y && !threadIdx.x)
|
||||||
|
{
|
||||||
|
((float2*)roiCache)[threadIdx.x] = tex2D(troi, blockIdx.y, x);
|
||||||
|
}
|
||||||
|
|
||||||
|
__syncthreads();
|
||||||
|
|
||||||
|
if (!roiCache[threadIdx.y]) return;
|
||||||
|
|
||||||
Level level = levels[downscales + blockIdx.z];
|
Level level = levels[downscales + blockIdx.z];
|
||||||
|
|
||||||
if(x >= level.workRect.x || y >= level.workRect.y) return;
|
if(x >= level.workRect.x || y >= level.workRect.y) return;
|
||||||
|
|
||||||
if (!tex2D(troi, x, y)) return;
|
|
||||||
|
|
||||||
Octave octave = octaves[level.octave];
|
Octave octave = octaves[level.octave];
|
||||||
int st = octave.index * octave.stages;
|
int st = octave.index * octave.stages;
|
||||||
const int stEnd = st + 1024;
|
const int stEnd = st + 1024;
|
||||||
@ -282,9 +260,9 @@ namespace icf {
|
|||||||
// if (blockIdx.z != 31) return;
|
// if (blockIdx.z != 31) return;
|
||||||
if(x >= level.workRect.x || y >= level.workRect.y) return;
|
if(x >= level.workRect.x || y >= level.workRect.y) return;
|
||||||
|
|
||||||
int roi = tex2D(troi, x, y);
|
// int roi = tex2D(troi, x, y);
|
||||||
printf("%d\n", roi);
|
// printf("%d\n", roi);
|
||||||
if (!roi) return;
|
// if (!roi) return;
|
||||||
|
|
||||||
Octave octave = octaves[level.octave];
|
Octave octave = octaves[level.octave];
|
||||||
|
|
||||||
@ -357,8 +335,8 @@ namespace icf {
|
|||||||
cudaChannelFormatDesc desc = cudaCreateChannelDesc<int>();
|
cudaChannelFormatDesc desc = cudaCreateChannelDesc<int>();
|
||||||
cudaSafeCall( cudaBindTexture2D(0, thogluv, hogluv.data, desc, hogluv.cols, hogluv.rows, hogluv.step));
|
cudaSafeCall( cudaBindTexture2D(0, thogluv, hogluv.data, desc, hogluv.cols, hogluv.rows, hogluv.step));
|
||||||
|
|
||||||
cudaChannelFormatDesc desc_roi = cudaCreateChannelDesc<char>();
|
cudaChannelFormatDesc desc_roi = cudaCreateChannelDesc<float2>();
|
||||||
cudaSafeCall( cudaBindTexture2D(0, troi, roi.data, desc_roi, roi.cols, roi.rows, roi.step));
|
cudaSafeCall( cudaBindTexture2D(0, troi, roi.data, desc_roi, roi.cols / 8, roi.rows, roi.step));
|
||||||
|
|
||||||
test_kernel_warp<false><<<grid, block>>>(l, oct, st, nd, lf, det, max_det, ctr, 0);
|
test_kernel_warp<false><<<grid, block>>>(l, oct, st, nd, lf, det, max_det, ctr, 0);
|
||||||
cudaSafeCall( cudaGetLastError());
|
cudaSafeCall( cudaGetLastError());
|
||||||
@ -391,8 +369,8 @@ namespace icf {
|
|||||||
cudaChannelFormatDesc desc = cudaCreateChannelDesc<int>();
|
cudaChannelFormatDesc desc = cudaCreateChannelDesc<int>();
|
||||||
cudaSafeCall( cudaBindTexture2D(0, thogluv, hogluv.data, desc, hogluv.cols, hogluv.rows, hogluv.step));
|
cudaSafeCall( cudaBindTexture2D(0, thogluv, hogluv.data, desc, hogluv.cols, hogluv.rows, hogluv.step));
|
||||||
|
|
||||||
cudaChannelFormatDesc desc_roi = cudaCreateChannelDesc<char>();
|
cudaChannelFormatDesc desc_roi = cudaCreateChannelDesc<float2>();
|
||||||
cudaSafeCall( cudaBindTexture2D(0, troi, roi.data, desc_roi, roi.cols, roi.rows, roi.step));
|
cudaSafeCall( cudaBindTexture2D(0, troi, roi.data, desc_roi, roi.cols / 8, roi.rows, roi.step));
|
||||||
|
|
||||||
if (scale >= downscales)
|
if (scale >= downscales)
|
||||||
test_kernel_warp<true><<<grid, block>>>(l, oct, st, nd, lf, det, max_det, ctr, scale);
|
test_kernel_warp<true><<<grid, block>>>(l, oct, st, nd, lf, det, max_det, ctr, scale);
|
||||||
|
@ -481,7 +481,7 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat&
|
|||||||
CV_Assert(colored.type() == CV_8UC3);
|
CV_Assert(colored.type() == CV_8UC3);
|
||||||
|
|
||||||
// we guess user knows about shrincage
|
// we guess user knows about shrincage
|
||||||
CV_Assert((rois.size() == getRoiSize()) && (rois.type() == CV_8UC1));
|
CV_Assert((rois.size().width == getRoiSize().height) && (rois.type() == CV_8UC1));
|
||||||
|
|
||||||
// only this window size allowed
|
// only this window size allowed
|
||||||
CV_Assert(colored.cols == Filds::FRAME_WIDTH && colored.rows == Filds::FRAME_HEIGHT);
|
CV_Assert(colored.cols == Filds::FRAME_WIDTH && colored.rows == Filds::FRAME_HEIGHT);
|
||||||
|
@ -47,7 +47,7 @@
|
|||||||
using cv::gpu::GpuMat;
|
using cv::gpu::GpuMat;
|
||||||
|
|
||||||
// show detection results on input image with cv::imshow
|
// show detection results on input image with cv::imshow
|
||||||
//#define SHOW_DETECTIONS
|
#define SHOW_DETECTIONS
|
||||||
|
|
||||||
#if defined SHOW_DETECTIONS
|
#if defined SHOW_DETECTIONS
|
||||||
# define SHOW(res) \
|
# define SHOW(res) \
|
||||||
@ -154,26 +154,30 @@ GPU_TEST_P(SoftCascadeTest, detectInROI,
|
|||||||
cv::gpu::SoftCascade cascade;
|
cv::gpu::SoftCascade cascade;
|
||||||
ASSERT_TRUE(cascade.load(cvtest::TS::ptr()->get_data_path() + GET_PARAM(0)));
|
ASSERT_TRUE(cascade.load(cvtest::TS::ptr()->get_data_path() + GET_PARAM(0)));
|
||||||
|
|
||||||
GpuMat colored(coloredCpu), objectBoxes(1, 16384, CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1);
|
GpuMat colored(coloredCpu), objectBoxes(1, 16384, CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1), trois;
|
||||||
rois.setTo(0);
|
rois.setTo(0);
|
||||||
|
|
||||||
int nroi = GET_PARAM(2);
|
int nroi = GET_PARAM(2);
|
||||||
|
cv::Mat result(coloredCpu);
|
||||||
cv::RNG rng;
|
cv::RNG rng;
|
||||||
for (int i = 0; i < nroi; ++i)
|
for (int i = 0; i < nroi; ++i)
|
||||||
{
|
{
|
||||||
cv::Rect r = getFromTable(rng(10));
|
cv::Rect r = getFromTable(rng(10));
|
||||||
GpuMat sub(rois, r);
|
GpuMat sub(rois, r);
|
||||||
sub.setTo(1);
|
sub.setTo(1);
|
||||||
|
r.x *= 4; r.y *= 4; r.width *= 4; r.height *= 4;
|
||||||
|
cv::rectangle(result, r, cv::Scalar(0, 0, 255, 255), 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
cascade.detectMultiScale(colored, rois, objectBoxes);
|
cv::gpu::transpose(rois, trois);
|
||||||
|
|
||||||
|
cascade.detectMultiScale(colored, trois, objectBoxes);
|
||||||
|
|
||||||
///
|
///
|
||||||
cv::Mat dt(objectBoxes);
|
cv::Mat dt(objectBoxes);
|
||||||
typedef cv::gpu::SoftCascade::Detection detection_t;
|
typedef cv::gpu::SoftCascade::Detection detection_t;
|
||||||
|
|
||||||
detection_t* dts = (detection_t*)dt.data;
|
detection_t* dts = (detection_t*)dt.data;
|
||||||
cv::Mat result(coloredCpu);
|
|
||||||
|
|
||||||
printTotal(std::cout, dt.cols);
|
printTotal(std::cout, dt.cols);
|
||||||
for (int i = 0; i < (int)(dt.cols / sizeof(detection_t)); ++i)
|
for (int i = 0; i < (int)(dt.cols / sizeof(detection_t)); ++i)
|
||||||
@ -204,8 +208,11 @@ GPU_TEST_P(SoftCascadeTest, detectInLevel,
|
|||||||
GpuMat colored(coloredCpu), objectBoxes(1, 100 * sizeof(detection_t), CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1);
|
GpuMat colored(coloredCpu), objectBoxes(1, 100 * sizeof(detection_t), CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1);
|
||||||
rois.setTo(1);
|
rois.setTo(1);
|
||||||
|
|
||||||
|
cv::gpu::GpuMat trois;
|
||||||
|
cv::gpu::transpose(rois, trois);
|
||||||
|
|
||||||
int level = GET_PARAM(2);
|
int level = GET_PARAM(2);
|
||||||
cascade.detectMultiScale(colored, rois, objectBoxes, 1, level);
|
cascade.detectMultiScale(colored, trois, objectBoxes, 1, level);
|
||||||
|
|
||||||
cv::Mat dt(objectBoxes);
|
cv::Mat dt(objectBoxes);
|
||||||
|
|
||||||
@ -246,6 +253,9 @@ TEST(SoftCascadeTest, detect)
|
|||||||
GpuMat sub(rois, cv::Rect(rois.cols / 4, rois.rows / 4,rois.cols / 2, rois.rows / 2));
|
GpuMat sub(rois, cv::Rect(rois.cols / 4, rois.rows / 4,rois.cols / 2, rois.rows / 2));
|
||||||
sub.setTo(cv::Scalar::all(1));
|
sub.setTo(cv::Scalar::all(1));
|
||||||
|
|
||||||
cascade.detectMultiScale(colored, rois, objectBoxes);
|
cv::gpu::GpuMat trois;
|
||||||
|
cv::gpu::transpose(rois, trois);
|
||||||
|
|
||||||
|
cascade.detectMultiScale(colored, trois, objectBoxes);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
Loading…
Reference in New Issue
Block a user