Improved thrust interop tutorial.

This commit is contained in:
Dan 2015-09-16 12:03:35 -04:00
parent 09d392f09d
commit 23fc5930b7
4 changed files with 104 additions and 47 deletions

1
.gitignore vendored
View File

@ -22,3 +22,4 @@ CMakeCache.txt
*.suo
*.log
*.tlog
build

View File

@ -0,0 +1,73 @@
Using a cv::cuda::GpuMat with thrust
===========================================
Goal
----
Thrust is an extremely powerful library for various cuda accelerated algorithms. However thrust is designed
to work with vectors and not pitched matricies. The following tutorial will discuss wrapping cv::cuda::GpuMat's
into thrust iterators that can be used with thrust algorithms.
This tutorial should show you how to:
- Wrap a GpuMat into a thrust iterator
- Fill a GpuMat with random numbers
- Sort a column of a GpuMat in place
- Copy values greater than 0 to a new gpu matrix
- Use streams with thrust
Wrapping a GpuMat into a thrust iterator
----
The following code will produce an iterator for a GpuMat
@snippet samples/cpp/tutorial_code/gpu/gpu-thrust-interop/Thrust_interop.hpp begin_itr
@snippet samples/cpp/tutorial_code/gpu/gpu-thrust-interop/Thrust_interop.hpp end_itr
Our goal is to have an iterator that will start at the beginning of the matrix, and increment correctly to access continuous matrix elements. This is trivial for a continuous row, but how about for a column
of a pitched matrix? To do this we need the iterator to be aware of the matrix dimensions and step. This information is embedded in the step_functor.
@snippet samples/cpp/tutorial_code/gpu/gpu-thrust-interop/Thrust_interop.hpp step_functor
The step functor takes in an index value and returns the appropriate
offset from the beginning of the matrix. The counting iterator simply increments over the range of pixel elements. Combined into the transform_iterator we have an iterator that counts from 0 to M*N and correctly
increments to account for the pitched memory of a GpuMat. Unfortunately this does not include any memory location information, for that we need a thrust::device_ptr. By combining a device pointer with the
transform_iterator we can point thrust to the first element of our matrix and have it step accordingly.
Fill a GpuMat with random numbers
----
Now that we have some nice functions for making iterators for thrust, lets use them to do some things OpenCV can't do. Unfortunately at the time of this writing, OpenCV doesn't have any Gpu random number generation.
Thankfully thrust does and it's now trivial to interop between the two.
Example taken from http://stackoverflow.com/questions/12614164/generating-a-random-number-vector-between-0-and-1-0-using-thrust
First we need to write a functor that will produce our random values.
@snippet samples/cpp/tutorial_code/gpu/gpu-thrust-interop/main.cu prg
This will take in an integer value and output a value between a and b.
Now we will populate our matrix with values between 0 and 10 with a thrust transform.
@snippet samples/cpp/tutorial_code/gpu/gpu-thrust-interop/main.cu random
Sort a column of a GpuMat in place
----
Lets fill matrix elements with random values and an index. Afterwards we will sort the random numbers and the indecies.
@snippet samples/cpp/tutorial_code/gpu/gpu-thrust-interop/main.cu sort
Copy values greater than 0 to a new gpu matrix while using streams
----
In this example we're going to see how cv::cuda::Streams can be used with thrust. Unfortunately this specific example uses functions that must return
results to the CPU so it isn't the optimal use of streams.
@snippet samples/cpp/tutorial_code/gpu/gpu-thrust-interop/main.cu copy_greater
First we will populate a GPU mat with randomly generated data between -1 and 1 on a stream.
@snippet samples/cpp/tutorial_code/gpu/gpu-thrust-interop/main.cu random_gen_stream
Notice the use of thrust::system::cuda::par.on(...), this creates an execution policy for executing thrust code on a stream.
There is a bug in the version of thrust distributed with the cuda toolkit, as of version 7.5 this has not been fixed. This bug causes code to not execute on streams.
The bug can however be fixed by using the newest version of thrust from the git repository. (http://github.com/thrust/thrust.git)
Next we will determine how many values are greater than 0 by using thrust::count_if with the following predicate:
@snippet samples/cpp/tutorial_code/gpu/gpu-thrust-interop/main.cu pred_greater
We will use those results to create an output buffer for storing the copied values, we will then use copy_if with the same predicate to populate the output buffer.
Lastly we will download the values into a CPU mat for viewing.

View File

@ -6,20 +6,10 @@
#include <thrust/iterator/counting_iterator.h>
#include <thrust/device_ptr.h>
template<typename T> struct
CV_TYPE
{
static const int DEPTH;
};
template<> static const int CV_TYPE<float>::DEPTH = CV_32F;
template<> static const int CV_TYPE<double>::DEPTH = CV_64F;
template<> static const int CV_TYPE<int>::DEPTH = CV_32S;
template<> static const int CV_TYPE<uchar>::DEPTH = CV_8U;
template<> static const int CV_TYPE<char>::DEPTH = CV_8S;
template<> static const int CV_TYPE<ushort>::DEPTH = CV_16U;
template<> static const int CV_TYPE<short>::DEPTH = CV_16S;
/*
@Brief step_functor is an object to correctly step a thrust iterator according to the stride of a matrix
*/
//! [step_functor]
template<typename T> struct step_functor : public thrust::unary_function<int, int>
{
int columns;
@ -41,7 +31,8 @@ template<typename T> struct step_functor : public thrust::unary_function<int, in
return idx;
}
};
//! [step_functor]
//! [begin_itr]
/*
@Brief GpuMatBeginItr returns a thrust compatible iterator to the beginning of a GPU mat's memory.
@Param mat is the input matrix
@ -52,11 +43,13 @@ thrust::permutation_iterator<thrust::device_ptr<T>, thrust::transform_iterator<s
{
if (channel == -1)
mat = mat.reshape(1);
CV_Assert(mat.depth() == CV_TYPE<T>::DEPTH);
CV_Assert(mat.depth() == cv::DataType<T>::depth);
CV_Assert(channel < mat.channels());
return thrust::make_permutation_iterator(thrust::device_pointer_cast(mat.ptr<T>(0) + channel),
thrust::make_transform_iterator(thrust::make_counting_iterator(0), step_functor<T>(mat.cols, mat.step / sizeof(T), mat.channels())));
}
//! [begin_itr]
//! [end_itr]
/*
@Brief GpuMatEndItr returns a thrust compatible iterator to the end of a GPU mat's memory.
@Param mat is the input matrix
@ -67,8 +60,11 @@ thrust::permutation_iterator<thrust::device_ptr<T>, thrust::transform_iterator<s
{
if (channel == -1)
mat = mat.reshape(1);
CV_Assert(mat.depth() == CV_TYPE<T>::DEPTH);
CV_Assert(mat.depth() == cv::DataType<T>::depth);
CV_Assert(channel < mat.channels());
return thrust::make_permutation_iterator(thrust::device_pointer_cast(mat.ptr<T>(0) + channel),
thrust::make_transform_iterator(thrust::make_counting_iterator(mat.rows*mat.cols), step_functor<T>(mat.cols, mat.step / sizeof(T), mat.channels())));
}
//! [end_itr]

View File

@ -5,6 +5,7 @@
#include <thrust/random.h>
#include <thrust/sort.h>
#include <thrust/system/cuda/execution_policy.h>
//! [prg]
struct prg
{
float a, b;
@ -22,36 +23,10 @@ struct prg
return dist(rng);
}
};
//! [prg]
template<typename T> struct pred_eq
{
T value;
int channel;
__host__ __device__
pred_eq(T value_, int channel_ = 0) :value(value_), channel(channel_){}
__host__ __device__
bool operator()(const T val) const
{
return val == value;
}
template<int N>
__host__ __device__ bool operator()(const cv::Vec<T, N>& val)
{
if (channel < N)
return val.val[channel] == value;
return false;
}
__host__ __device__ bool operator()( const thrust::tuple<T, T, T>& val)
{
if (channel == 0)
return thrust::get<0>(val) == value;
if (channel == 1)
return thrust::get<1>(val) == value;
if (channel == 2)
return thrust::get<2>(val) == value;
}
};
//! [pred_greater]
template<typename T> struct pred_greater
{
T value;
@ -61,12 +36,14 @@ template<typename T> struct pred_greater
return val > value;
}
};
//! [pred_greater]
int main(void)
{
// Generate a 2 channel row matrix with 100 elements. Set the first channel to be the element index, and the second to be a randomly
// generated value. Sort by the randomly generated value while maintaining index association.
//! [sort]
{
cv::cuda::GpuMat d_idx(1, 100, CV_32SC2);
@ -82,8 +59,10 @@ int main(void)
cv::Mat h_idx(d_idx);
}
//! [sort]
// Randomly fill a row matrix with 100 elements between -1 and 1
//! [random]
{
cv::cuda::GpuMat d_value(1, 100, CV_32F);
auto valueBegin = GpuMatBeginItr<float>(d_value);
@ -92,8 +71,10 @@ int main(void)
cv::Mat h_value(d_value);
}
//! [random]
// OpenCV has count non zero, but what if you want to count a specific value?
//! [count_value]
{
cv::cuda::GpuMat d_value(1, 100, CV_32S);
d_value.setTo(cv::Scalar(0));
@ -101,18 +82,24 @@ int main(void)
auto count = thrust::count(GpuMatBeginItr<int>(d_value), GpuMatEndItr<int>(d_value), 15);
std::cout << count << std::endl;
}
//! [count_value]
// Randomly fill an array then copy only values greater than 0. Perform these tasks on a stream.
//! [copy_greater]
{
cv::cuda::GpuMat d_value(1, 100, CV_32F);
auto valueBegin = GpuMatBeginItr<float>(d_value);
auto valueEnd = GpuMatEndItr<float>(d_value);
cv::cuda::Stream stream;
//! [random_gen_stream]
thrust::transform(thrust::system::cuda::par.on(cv::cuda::StreamAccessor::getStream(stream)), thrust::make_counting_iterator(0), thrust::make_counting_iterator(d_value.cols), valueBegin, prg(-1, 1));
//! [random_gen_stream]
int count = thrust::count_if(thrust::system::cuda::par.on(cv::cuda::StreamAccessor::getStream(stream)), valueBegin, valueEnd, pred_greater<float>(0.0));
cv::cuda::GpuMat d_valueGreater(1, count, CV_32F);
thrust::copy_if(thrust::system::cuda::par.on(cv::cuda::StreamAccessor::getStream(stream)), valueBegin, valueEnd, GpuMatBeginItr<float>(d_valueGreater), pred_greater<float>(0.0));
cv::Mat h_greater(d_valueGreater);
}
//! [copy_greater]
return 0;
}