mirror of https://github.com/opencv/opencv.git
reduce slice, concat to copy; enable more concat fusions
parent 657c8d1c65
commit cbdaa93e54
@@ -16,6 +16,8 @@
 #include "../cuda4dnn/csl/tensor.hpp"
 #include "../cuda4dnn/csl/span.hpp"
 
+#include "../cuda4dnn/kernels/fill_copy.hpp"
+
 #include <cstddef>
 #include <vector>
 
@@ -95,6 +97,20 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
         TensorSpan<T> output, std::size_t output_axis_offset,
         TensorView<T> input, std::size_t axis)
     {
+        CV_Assert(output.rank() == input.rank());
+        CV_Assert(output_axis_offset < output.get_axis_size(axis));
+
+        /* if axes preceding the concat axis are all singleton, the concat blocks are contiguous
+         * in the output and we can copy each block directly
+         */
+        if (output.size_range(0, axis) == 1)
+        {
+            auto stride = output.size_range(axis + 1, output.rank());
+            auto sliced_output = Span<T>(output.get() + output_axis_offset * stride, input.size());
+            kernels::copy<T>(stream, sliced_output, input);
+            return;
+        }
+
         /* let's call the axis of interest the channel axis for the purpose of the following discussion
          * even though it can be any axis
          *
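The contiguity argument above is easiest to see with concrete shapes. Below is a minimal host-side sketch in plain C++ (not the cuda4dnn API; the shapes and buffers are illustrative) showing that when every axis before the concat axis is singleton, each input occupies one contiguous block of the flattened output starting at offset output_axis_offset * stride:

    #include <algorithm>
    #include <cstddef>
    #include <iostream>
    #include <vector>

    int main()
    {
        // Concatenate inputs of shape [1, 2, 2, 2] and [1, 3, 2, 2] along
        // axis 1 into an output of shape [1, 5, 2, 2]. All axes before the
        // concat axis are singleton, so each input is one contiguous block
        // of the flattened output.
        std::vector<float> a(1 * 2 * 2 * 2, 1.f);   // first input, all 1s
        std::vector<float> b(1 * 3 * 2 * 2, 2.f);   // second input, all 2s
        std::vector<float> out(1 * 5 * 2 * 2, 0.f); // concat result

        // stride = product of the axis sizes after the concat axis (H * W)
        const std::size_t stride = 2 * 2;

        // `a` goes at axis offset 0, `b` at axis offset 2 (a's channel count)
        std::copy(a.begin(), a.end(), out.begin() + 0 * stride);
        std::copy(b.begin(), b.end(), out.begin() + 2 * stride);

        for (float v : out) std::cout << v << ' '; // eight 1s, then twelve 2s
        std::cout << '\n';
    }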
@@ -15,11 +15,14 @@
 #include "../cuda4dnn/csl/tensor.hpp"
 #include "../cuda4dnn/csl/span.hpp"
 
+#include "../cuda4dnn/kernels/fill_copy.hpp"
+
 #include <opencv2/core.hpp>
 
 #include <cstddef>
 #include <vector>
 #include <iostream>
+#include <algorithm>
 
 using namespace cv::dnn::cuda4dnn::csl;
 using namespace cv::dnn::cuda4dnn::csl::device;
@@ -79,6 +82,14 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
         CV_Assert(output.rank() == input.rank());
         CV_Assert(output.rank() == offsets.size());
 
+        /* copy directly if no slicing is required */
+        if (is_shape_same(output, input))
+        {
+            CV_Assert(std::all_of(std::begin(offsets), std::end(offsets), [] (std::size_t x) { return x == 0; }));
+            kernels::copy<T>(stream, output, input);
+            return;
+        }
+
         /* squeezable axes at the beginning of both tensors can be eliminated
          *
          * Reasoning:
@@ -146,6 +157,27 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
 
         auto rank = inShape.size();
 
+        /* We can do a copy if the reduced rank is two and only the first axis is sliced.
+         * The general requirement is that only one axis is sliced and all the axes that
+         * precede the sliced axis are singleton. However, the reductions above will remove
+         * all the leading singleton axes and merge the trailing unsliced axes into one, or
+         * zero if there are no trailing unsliced axes. The latter is handled separately.
+         */
+        if (rank == 2 && offsets[0] != 0 && offsets[1] == 0)
+        {
+            auto stride = inShape[1];
+            auto sliced_input = View<T>(input.get() + offsets[0] * stride, output.size());
+            kernels::copy<T>(stream, output, sliced_input);
+            return;
+        }
+
+        if (rank == 1)
+        {
+            auto sliced_input = View<T>(input.get() + offsets[0], output.size());
+            kernels::copy<T>(stream, output, sliced_input);
+            return;
+        }
+
         std::vector<std::size_t> inStride(rank), outStride(rank);
         inStride.back() = 1;
         outStride.back() = 1;
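For intuition, here is a minimal host-side sketch in plain C++ (again illustrative, not the cuda4dnn API) of the rank-2 fast path: once the shape has been reduced to [N, K] with only the first axis sliced, the sliced region is a single contiguous run starting at offsets[0] * K:

    #include <algorithm>
    #include <cstddef>
    #include <iostream>
    #include <numeric>
    #include <vector>

    int main()
    {
        // Reduced input shape [4, 6]; slice rows [1, 3), i.e. offsets = {1, 0}.
        const std::size_t stride = 6;               // merged trailing extent
        std::vector<float> input(4 * stride);
        std::iota(input.begin(), input.end(), 0.f); // 0, 1, ..., 23

        std::vector<float> output(2 * stride);      // rows 1 and 2 of the input

        // The sliced region is contiguous: output.size() elements starting at
        // offsets[0] * stride, so a single copy implements the slice.
        std::copy_n(input.begin() + 1 * stride, output.size(), output.begin());

        for (float v : output) std::cout << v << ' '; // 6 7 ... 17
        std::cout << '\n';
    }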
@@ -47,20 +47,6 @@ namespace cv { namespace dnn { namespace cuda4dnn {
 
         CV_Assert(offsets.size() == outputs.size());
 
-        /* one output with the same shape as the input => direct copy */
-        if (outputs.size() == 1)
-        {
-            auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
-            auto output = output_wrapper->getSpan();
-
-            if (is_shape_same(output, input))
-            {
-                CV_Assert(std::all_of(std::begin(offsets[0]), std::end(offsets[0]), [] (std::size_t x) { return x == 0; }));
-                kernels::copy<T>(stream, output, input);
-                return;
-            }
-        }
-
         for (int i = 0; i < outputs.size(); ++i)
         {
             auto output_wrapper = outputs[i].dynamicCast<wrapper_type>();
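This primitive-level fast path appears to be subsumed rather than lost: the is_shape_same direct-copy check added to the slice kernel in the earlier hunk handles the same case, so keeping a duplicate check here would be redundant.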
@@ -2788,7 +2788,13 @@ struct Net::Impl : public detail::NetImplBase
                 if (preferableBackend == DNN_BACKEND_CUDA &&
                     (inp_i_data->layerInstance->supportBackend(DNN_BACKEND_CUDA) == false ||
                      (inp_i_data->layerInstance->type != "Convolution" &&
-                      inp_i_data->layerInstance->type != "Pooling")))
+                      inp_i_data->layerInstance->type != "Pooling" &&
+                      inp_i_data->layerInstance->type != "Resize" &&
+                      inp_i_data->layerInstance->type != "Flatten" &&
+                      inp_i_data->layerInstance->type != "Permute" &&
+                      inp_i_data->layerInstance->type != "Reorg" &&
+                      inp_i_data->layerInstance->type != "Eltwise" &&
+                      inp_i_data->layerInstance.dynamicCast<ActivationLayer>().empty())))
                 {
                     break;
                 }
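This widened type list is the "enable more concat fusions" part of the commit title: previously only Convolution and Pooling producers avoided the break, whereas now Resize, Flatten, Permute, Reorg, Eltwise, and activation layers also qualify, so their outputs can be fused into the concatenated buffer as well.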