Merge pull request #8367 from khnaba:cuda-calchist-with-mask

Implement cv::cuda::calcHist with mask support (#8367)

* Implement cuda::calcHist with mask

* Fix documentation build warning

* Use separate step sizes for src and mask; addresses a review comment
Naba Kumar 2017-03-15 11:34:00 +02:00 committed by Alexander Alekhin
parent a83a1cafa7
commit 27cf6e549e
4 changed files with 131 additions and 1 deletion

modules/cudaimgproc/include/opencv2/cudaimgproc.hpp

@@ -201,6 +201,15 @@ CV_EXPORTS void alphaComp(InputArray img1, InputArray img2, OutputArray dst, int
*/
CV_EXPORTS void calcHist(InputArray src, OutputArray hist, Stream& stream = Stream::Null());
/** @brief Calculates a histogram for a one-channel 8-bit image, restricted to the given mask.

@param src Source image with CV_8UC1 type.
@param hist Destination histogram with one row, 256 columns, and the CV_32SC1 type.
@param mask Operation mask of type CV_8UC1 and the same size as src; only pixels with non-zero mask values are counted.
@param stream Stream for the asynchronous version.
*/
CV_EXPORTS void calcHist(InputArray src, InputArray mask, OutputArray hist, Stream& stream = Stream::Null());
/** @brief Equalizes the histogram of a grayscale image.
@param src Source image with CV_8UC1 type.
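
For context, a minimal usage sketch of the new overload (hypothetical host code, not part of this commit; the file name and threshold are arbitrary):

#include <opencv2/cudaimgproc.hpp>
#include <opencv2/imgcodecs.hpp>

int main()
{
    cv::Mat img = cv::imread("input.png", cv::IMREAD_GRAYSCALE); // CV_8UC1
    cv::Mat mask = img > 127;                  // CV_8UC1, 255 where img is bright

    cv::cuda::GpuMat d_img(img), d_mask(mask), d_hist;
    cv::cuda::calcHist(d_img, d_mask, d_hist); // histogram of bright pixels only

    cv::Mat hist;                              // 1x256, CV_32SC1 after download
    d_hist.download(hist);
    return 0;
}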

modules/cudaimgproc/src/cuda/hist.cu

@@ -105,6 +105,72 @@ namespace hist
        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    __global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t srcStep, const uchar* mask, size_t maskStep, int* hist)
    {
        __shared__ int shist[256];

        const int y = blockIdx.x * blockDim.y + threadIdx.y;
        const int tid = threadIdx.y * blockDim.x + threadIdx.x;

        // Each 256-thread block accumulates into a private shared-memory histogram.
        shist[tid] = 0;
        __syncthreads();

        if (y < rows)
        {
            const unsigned int* rowPtr = (const unsigned int*) (src + y * srcStep);
            const unsigned int* maskRowPtr = (const unsigned int*) (mask + y * maskStep);

            // Read four packed 8-bit pixels (and the four matching mask bytes)
            // per 32-bit load; a pixel is counted only if its mask byte is non-zero.
            const int cols_4 = cols / 4;
            for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
            {
                unsigned int data = rowPtr[x];
                unsigned int m = maskRowPtr[x];

                if ((m >> 0) & 0xFFU)
                    Emulation::smem::atomicAdd(&shist[(data >> 0) & 0xFFU], 1);
                if ((m >> 8) & 0xFFU)
                    Emulation::smem::atomicAdd(&shist[(data >> 8) & 0xFFU], 1);
                if ((m >> 16) & 0xFFU)
                    Emulation::smem::atomicAdd(&shist[(data >> 16) & 0xFFU], 1);
                if ((m >> 24) & 0xFFU)
                    Emulation::smem::atomicAdd(&shist[(data >> 24) & 0xFFU], 1);
            }

            // One thread per row handles the up-to-three trailing pixels
            // when the width is not a multiple of four.
            if (cols % 4 != 0 && threadIdx.x == 0)
            {
                for (int x = cols_4 * 4; x < cols; ++x)
                {
                    unsigned int data = ((const uchar*)rowPtr)[x];
                    unsigned int m = ((const uchar*)maskRowPtr)[x];

                    if (m)
                        Emulation::smem::atomicAdd(&shist[data], 1);
                }
            }
        }

        __syncthreads();

        // Merge this block's shared histogram into the global result.
        const int histVal = shist[tid];
        if (histVal > 0)
            ::atomicAdd(hist + tid, histVal);
    }

    void histogram256(PtrStepSzb src, PtrStepSzb mask, int* hist, cudaStream_t stream)
    {
        const dim3 block(32, 8);
        const dim3 grid(divUp(src.rows, block.y));

        histogram256Kernel<<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, mask.data, mask.step, hist);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }
}
/////////////////////////////////////////////////////////////////////////
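
The kernel above processes four pixels per iteration: four consecutive 8-bit pixels, and the four corresponding mask bytes, are fetched as single 32-bit words and unpacked with shifts. A scalar C++ sketch of the same unpacking logic, for illustration only (countPacked is a hypothetical helper, not part of the commit):

// Add four packed 8-bit pixels to a 256-bin histogram, honoring the
// four packed mask bytes that accompany them.
static void countPacked(unsigned int data, unsigned int m, int hist[256])
{
    for (int i = 0; i < 4; ++i)
    {
        if ((m >> (8 * i)) & 0xFFU)            // mask byte non-zero?
            ++hist[(data >> (8 * i)) & 0xFFU]; // bin index = pixel value
    }
}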

modules/cudaimgproc/src/histogram.cpp

@@ -69,20 +69,32 @@ void cv::cuda::histRange(InputArray, GpuMat*, const GpuMat*, Stream&) { throw_no
namespace hist
{
    void histogram256(PtrStepSzb src, int* hist, cudaStream_t stream);
    void histogram256(PtrStepSzb src, PtrStepSzb mask, int* hist, cudaStream_t stream);
}

void cv::cuda::calcHist(InputArray _src, OutputArray _hist, Stream& stream)
{
    calcHist(_src, cv::cuda::GpuMat(), _hist, stream);
}

void cv::cuda::calcHist(InputArray _src, InputArray _mask, OutputArray _hist, Stream& stream)
{
    GpuMat src = _src.getGpuMat();
    GpuMat mask = _mask.getGpuMat();

    CV_Assert( src.type() == CV_8UC1 );
    CV_Assert( mask.empty() || mask.type() == CV_8UC1 );
    CV_Assert( mask.empty() || mask.size() == src.size() );

    _hist.create(1, 256, CV_32SC1);
    GpuMat hist = _hist.getGpuMat();

    hist.setTo(Scalar::all(0), stream);

    // Dispatch to the masked kernel only when a mask was supplied.
    if (mask.empty())
        hist::histogram256(src, hist.ptr<int>(), StreamAccessor::getStream(stream));
    else
        hist::histogram256(src, mask, hist.ptr<int>(), StreamAccessor::getStream(stream));
}
////////////////////////////////////////////////////////////////////////
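
Since the new overload accepts a Stream, uploads, the histogram computation, and the download can all be queued without blocking the host. A minimal sketch (variable names are illustrative; src and mask are CV_8UC1 cv::Mat of equal size):

cv::cuda::Stream stream;
cv::cuda::GpuMat d_src, d_mask, d_hist;

d_src.upload(src, stream);
d_mask.upload(mask, stream);
cv::cuda::calcHist(d_src, d_mask, d_hist, stream);

cv::Mat hist;
d_hist.download(hist, stream);
stream.waitForCompletion(); // hist is valid only after this returns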

modules/cudaimgproc/test/test_histogram.cpp

@@ -136,6 +136,49 @@ INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, CalcHist, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES));

PARAM_TEST_CASE(CalcHistWithMask, cv::cuda::DeviceInfo, cv::Size)
{
    cv::cuda::DeviceInfo devInfo;
    cv::Size size;

    virtual void SetUp()
    {
        devInfo = GET_PARAM(0);
        size = GET_PARAM(1);

        cv::cuda::setDevice(devInfo.deviceID());
    }
};

CUDA_TEST_P(CalcHistWithMask, Accuracy)
{
    cv::Mat src = randomMat(size, CV_8UC1);
    cv::Mat mask = randomMat(size, CV_8UC1);
    // Zero the top-left quadrant so part of the image is guaranteed to be masked out.
    cv::Mat(mask, cv::Rect(0, 0, size.width / 2, size.height / 2)).setTo(0);

    cv::cuda::GpuMat hist;
    cv::cuda::calcHist(loadMat(src), loadMat(mask), hist);

    // Gold result: CPU cv::calcHist over the same mask, reshaped and converted
    // to match the 1x256 CV_32SC1 layout produced on the GPU.
    cv::Mat hist_gold;

    const int hbins = 256;
    const float hranges[] = {0.0f, 256.0f};
    const int histSize[] = {hbins};
    const float* ranges[] = {hranges};
    const int channels[] = {0};

    cv::calcHist(&src, 1, channels, mask, hist_gold, 1, histSize, ranges);
    hist_gold = hist_gold.reshape(1, 1);
    hist_gold.convertTo(hist_gold, CV_32S);

    EXPECT_MAT_NEAR(hist_gold, hist, 0.0);
}

INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, CalcHistWithMask, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES));
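
With a typical OpenCV build, the new tests can be run in isolation through the module's gtest binary (the binary name below follows OpenCV's usual opencv_test_<module> convention and may differ per build):

./bin/opencv_test_cudaimgproc --gtest_filter=CUDA_ImgProc/CalcHistWithMask.*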
///////////////////////////////////////////////////////////////////////////////////////////////////////
// EqualizeHist