Merge pull request #24080 from dkurt:dnn_cuda_layers

Resolve uncovered CUDA dnn layer #24080

### Pull Request Readiness Checklist

* Gelu activation layer on CUDA
* Relax Gemm import from ONNX (set `is_matmul` only for weights with more than two dimensions)

resolves https://github.com/opencv/opencv/issues/24064
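A minimal sketch of how the new CUDA path can be exercised from the DNN API (the model file name and input shape are placeholders, and an OpenCV build with CUDA support is assumed):

```cpp
#include <opencv2/dnn.hpp>

int main()
{
    // Placeholder: any ONNX model containing a Gelu node (and/or a Gemm with 2-D weights).
    cv::dnn::Net net = cv::dnn::readNetFromONNX("model_with_gelu.onnx");

    // With this patch, Gelu no longer forces a fallback from the CUDA backend.
    net.setPreferableBackend(cv::dnn::DNN_BACKEND_CUDA);
    net.setPreferableTarget(cv::dnn::DNN_TARGET_CUDA);

    cv::Mat input = cv::Mat::zeros(1, 128, CV_32F);  // placeholder input
    net.setInput(input);
    cv::Mat out = net.forward();
    return 0;
}
```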

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [x] There is a reference to the original bug report and related work
- [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [x] The feature is well documented and sample code can be built with the project CMake
Dmitry Kurtaev 2023-08-03 09:13:42 +03:00 committed by GitHub
parent 0245c0cd10
commit 96f23e3da1
9 changed files with 123 additions and 5 deletions


@@ -643,4 +643,69 @@ INSTANTIATE_TEST_CASE_P(/**/, Layer_ScatterND, testing::Values(std::make_tuple(D
INSTANTIATE_TEST_CASE_P(/**/, Layer_LayerNorm, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU)));
INSTANTIATE_TEST_CASE_P(/**/, Layer_LayerNormExpanded, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU)));
typedef TestBaseWithParam<tuple<Vec4i, int, bool, tuple<Backend, Target> > > Layer_FullyConnected;
PERF_TEST_P_(Layer_FullyConnected, fc)
{
std::vector<int> inpShape;
inpShape.reserve(4);
for (int i = 0; i < 4; ++i) {
int dim = get<0>(GetParam())[i];
if (dim == 0)
break;
inpShape.push_back(dim);
}
Mat input(inpShape, CV_32F);
randn(input, 0, 1);
int axis = input.dims - 1;
int outDims = get<1>(GetParam());
bool isMatMul = get<2>(GetParam());
int backendId = get<0>(get<3>(GetParam()));
int targetId = get<1>(get<3>(GetParam()));
std::vector<int> weightShape;
if (isMatMul) {
weightShape = inpShape;
weightShape[weightShape.size() - 2] = outDims;
} else {
weightShape = {outDims, (int)input.total(axis, input.dims)};
}
Mat weights(weightShape, CV_32F);
randn(weights, 0, 1);
LayerParams lp;
lp.set("axis", input.dims - 1);
lp.set("is_matmul", weights.dims > 2);
lp.set("bias_term", false);
lp.set("transB", true);
lp.set("num_output", (int)weights.total(0, weights.dims - 1));
lp.blobs.resize(1, weights);
Net net;
net.addLayerToPrev("matmul", "InnerProduct", lp);
net.setInput(input);
net.setPreferableBackend(backendId);
net.setPreferableTarget(targetId);
// warmup
Mat output = net.forward();
TEST_CYCLE()
{
net.forward();
}
SANITY_CHECK_NOTHING();
}
INSTANTIATE_TEST_CASE_P(/**/, Layer_FullyConnected, Combine(
Values( // input size
Vec4i(5, 512, 384),
Vec4i(5, 16, 512, 128)
),
Values(256, 512, 1024), // output dimension
testing::Bool(), // is_matmul
dnnBackendsAndTargets()
));
} // namespace
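As a worked example of the shapes exercised by the `isMatMul` branch above (taking `Vec4i(5, 16, 512, 128)` with `outDims = 256`), here is a small sketch of the weight-shape rule; the helper name is only for illustration:

```cpp
#include <vector>

// Mirrors the isMatMul branch of the perf test: keep the leading (batch) dims
// of the input and replace the second-to-last dimension with outDims.
static std::vector<int> matmulWeightShape(std::vector<int> inpShape, int outDims)
{
    inpShape[inpShape.size() - 2] = outDims;
    return inpShape;
}

// matmulWeightShape({5, 16, 512, 128}, 256) -> {5, 16, 256, 128}; with
// transB = true the layer computes
//   (5 x 16 x 512 x 128) * (5 x 16 x 256 x 128)^T  ->  5 x 16 x 512 x 256.
```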


@@ -248,6 +248,11 @@ void selu(const Stream& stream, Span<T> output, View<T> input, T alpha, T gamma)
generic_op<T, SeluFunctor<T>>(stream, output, input, {alpha, gamma});
}
template <class T>
void gelu(const Stream& stream, Span<T> output, View<T> input) {
generic_op<T, GeluFunctor<T>>(stream, output, input);
}
template <class T>
void sign(const Stream& stream, Span<T> output, View<T> input) {
generic_op<T, SignFunctor<T>>(stream, output, input);
@@ -324,6 +329,7 @@ template void tan<__half>(const Stream&, Span<__half>, View<__half>);
template void celu<__half>(const Stream&, Span<__half>, View<__half>, __half);
template void hardsigmoid<__half>(const Stream&, Span<__half>, View<__half>, __half, __half);
template void selu<__half>(const Stream&, Span<__half>, View<__half>, __half, __half);
template void gelu<__half>(const Stream&, Span<__half>, View<__half>);
template void thresholdedrelu<__half>(const Stream&, Span<__half>, View<__half>, __half);
template void power<__half>(const Stream&, Span<__half>, View<__half>, __half, __half, __half);
template void exp<__half>(const Stream&, Span<__half>, View<__half>, __half, __half);
@@ -366,6 +372,7 @@ template void tan<float>(const Stream&, Span<float>, View<float>);
template void celu<float>(const Stream&, Span<float>, View<float>, float);
template void hardsigmoid<float>(const Stream&, Span<float>, View<float>, float, float);
template void selu<float>(const Stream&, Span<float>, View<float>, float, float);
template void gelu<float>(const Stream&, Span<float>, View<float>);
template void thresholdedrelu<float>(const Stream&, Span<float>, View<float>, float);
template void power<float>(const Stream&, Span<float>, View<float>, float, float, float);
template void exp<float>(const Stream&, Span<float>, View<float>, float, float);


@@ -588,6 +588,21 @@ struct SeluFunctor {
T alpha, gamma;
};
template <class T>
struct GeluFunctor {
struct Params {
CUDA4DNN_HOST_DEVICE Params() { }
};
CUDA4DNN_DEVICE GeluFunctor() { }
CUDA4DNN_DEVICE GeluFunctor(const Params& params) { }
CUDA4DNN_DEVICE T operator()(T value) {
using csl::device::erf;
return static_cast<T>(0.5f) * value * (static_cast<T>(1.f) + erf(value * static_cast<T>(M_SQRT1_2)));
}
};
template <class T>
struct ThresholdedReluFunctor {
struct Params {

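For reference, the functor above computes the exact (erf-based) form of GELU, the same expression used by the existing CPU implementation further down:

```latex
\operatorname{GELU}(x) = x\,\Phi(x) = \frac{x}{2}\left(1 + \operatorname{erf}\!\left(\frac{x}{\sqrt{2}}\right)\right)
```

where \Phi is the standard normal CDF and 1/\sqrt{2} corresponds to the `M_SQRT1_2` constant in the code.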

@@ -114,6 +114,9 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
template <class T>
void selu(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, T alpha, T gamma);
template <class T>
void gelu(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input);
template <class T>
void thresholdedrelu(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, T alpha);


@@ -537,6 +537,20 @@ namespace cv { namespace dnn { namespace cuda4dnn {
const T alpha, gamma;
};
template <class T>
class GeluOp final : public BaseOp<GeluOp, T> {
public:
GeluOp(csl::Stream stream_) : stream(std::move(stream_)) { }
void calculate(csl::TensorSpan<T> output, csl::TensorView<T> input) const
{
kernels::gelu<T>(stream, output, input);
}
private:
csl::Stream stream;
};
template <class T>
class ThresholdedReluOp final : public BaseOp<ThresholdedReluOp, T> {
public:


@@ -821,7 +821,7 @@ struct GeluFunctor : public BaseDefaultFunctor<GeluFunctor>
bool supportBackend(int backendId, int)
{
- return backendId == DNN_BACKEND_OPENCV;
+ return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_CUDA;
}
inline float calculate(float x) const
@@ -829,6 +829,13 @@ struct GeluFunctor : public BaseDefaultFunctor<GeluFunctor>
return 0.5f * x * (1.0f + erf(x * M_SQRT1_2));
}
#ifdef HAVE_CUDA
Ptr<BackendNode> initCUDA(int target, csl::Stream stream)
{
return make_cuda_node<cuda4dnn::GeluOp>(target, stream);
}
#endif
int64 getFLOPSPerElement() const { return 100; }
};
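A hedged sketch of a CPU-vs-CUDA consistency check for this new path; it assumes the activation is registered under the layer type name `Gelu` and that OpenCV is built with CUDA:

```cpp
#include <opencv2/dnn.hpp>

void checkGeluCudaMatchesCpu()
{
    cv::dnn::LayerParams lp;
    cv::dnn::Net net;
    net.addLayerToPrev("gelu", "Gelu", lp);  // assumed layer type name

    cv::Mat input(1, 256, CV_32F);
    cv::randn(input, 0, 1);
    net.setInput(input);

    net.setPreferableBackend(cv::dnn::DNN_BACKEND_OPENCV);
    cv::Mat ref = net.forward().clone();

    net.setPreferableBackend(cv::dnn::DNN_BACKEND_CUDA);
    net.setPreferableTarget(cv::dnn::DNN_TARGET_CUDA);
    cv::Mat out = net.forward().clone();

    CV_Assert(cv::norm(ref, out, cv::NORM_INF) < 1e-4);
}
```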


@@ -630,8 +630,10 @@ public:
if(input_wrapper->getRank() == inp2Dim)
return make_cuda_node<cuda4dnn::MatMulOp>(preferableTarget, std::move(context->stream), std::move(context->cublas_handle), oriMat, biasMat_, transA, transB);
- else
+ else {
+ CV_LOG_INFO(NULL, "DNN/CUDA: no implementation for MatMul with rank " << input_wrapper->getRank());
return Ptr<BackendNode>();
+ }
}
auto flatten_start_axis = normalize_axis(axis, input_wrapper->getRank());

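Minor note on the new `CV_LOG_INFO` diagnostic above: it is only printed when the active OpenCV log level includes INFO messages. A quick, hedged way to surface it when investigating an unexpected CPU fallback (call before `net.forward()`):

```cpp
#include <opencv2/core/utils/logger.hpp>

// Raise the global log level so DNN/CUDA INFO diagnostics become visible.
cv::utils::logging::setLogLevel(cv::utils::logging::LOG_LEVEL_INFO);
```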

@@ -1965,9 +1965,11 @@ void ONNXImporter::parseGemm(LayerParams& layerParams, const opencv_onnx::NodePr
}
int transB = layerParams.get<int>("transB", 0);
int secondInpDims;
if (constBlobs.find(node_proto.input(1)) != constBlobs.end())
{
Mat weights = getBlob(node_proto, 1);
secondInpDims = weights.dims;
if (transA == 0) // optimized branch; for now, we can only optimize the Gemm when transA == 0.
{
@@ -1993,7 +1995,10 @@ void ONNXImporter::parseGemm(LayerParams& layerParams, const opencv_onnx::NodePr
}
}
else
{
layerParams.set("transB", transB == 1);
secondInpDims = outShapes[node_proto.input(1)].size();
}
if (node_proto.input_size() == 3)
{
@@ -2002,7 +2007,7 @@ void ONNXImporter::parseGemm(LayerParams& layerParams, const opencv_onnx::NodePr
}
layerParams.set("bias_term", node_proto.input_size() == 3);
layerParams.set("is_matmul", true);
layerParams.set("is_matmul", secondInpDims > 2);
addLayer(layerParams, node_proto);
}
@@ -2045,7 +2050,7 @@ void ONNXImporter::parseMatMul(LayerParams& layerParams, const opencv_onnx::Node
layerParams.blobs.push_back(transBlob);
int numOutput = layerParams.blobs[0].total(0, secondInpDims - 1);
layerParams.set("num_output", numOutput);
layerParams.set("is_matmul", true);
layerParams.set("is_matmul", secondInpDims > 2);
} else
secondInpDims = outShapes[node_proto.input(1)].size();

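Net effect of the importer changes above: `is_matmul` is now set only when the second Gemm/MatMul input has more than two dimensions, so a Gemm with a plain 2-D weight matrix is imported as an ordinary fully-connected layer. A hedged sketch of the equivalent layer configuration, mirroring the perf test at the top of this commit (shapes and names are placeholders):

```cpp
#include <opencv2/dnn.hpp>

void buildGemmAsInnerProduct()
{
    // Placeholder 2-D weight matrix playing the role of the constant Gemm input B.
    cv::Mat B(256, 128, CV_32F);
    cv::randn(B, 0, 1);

    cv::dnn::LayerParams lp;
    lp.set("bias_term", false);
    lp.set("transB", true);          // B is stored as (num_output x K)
    lp.set("num_output", B.size[0]);
    lp.set("is_matmul", false);      // 2-D weights: ordinary fully connected
    lp.blobs.push_back(B);

    cv::dnn::Net net;
    net.addLayerToPrev("gemm", "InnerProduct", lp);
}
```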

@@ -102,7 +102,7 @@ public:
netSoftmax.setInput(ref);
ref = netSoftmax.forward();
}
normAssert(ref, out, "", l1 ? l1 : default_l1, lInf ? lInf : default_lInf);
normAssert(ref, out, basename.c_str(), l1 ? l1 : default_l1, lInf ? lInf : default_lInf);
if (checkNoFallbacks)
expectNoFallbacksFromIE(net);
}