Mirror of https://github.com/opencv/opencv.git (synced 2025-08-05 22:19:14 +08:00)
Merge pull request #24080 from dkurt:dnn_cuda_layers
Resolve uncovered CUDA dnn layer #24080

### Pull Request Readiness Checklist

* Gelu activation layer on CUDA
* Try to relax GEMM from ONNX

Resolves https://github.com/opencv/opencv/issues/24064

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under the Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on code under GPL or another license that is incompatible with OpenCV.
- [x] The PR is proposed to the proper branch.
- [x] There is a reference to the original bug report and related work.
- [x] There is an accuracy test, a performance test, and test data in the opencv_extra repository, if applicable. The patch to opencv_extra has the same branch name.
- [x] The feature is well documented and sample code can be built with the project CMake.
This commit is contained in:
parent: 0245c0cd10
commit: 96f23e3da1
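For context: both the existing CPU implementation and the new CUDA functor in this patch evaluate GELU in its exact erf form, 0.5 * x * (1 + erf(x / sqrt(2))). A minimal standalone reference sketch of that expression (plain C++, not part of the patch):

```cpp
#include <cmath>
#include <cstdio>

// Exact (erf-based) GELU, the same expression used by the CPU GeluFunctor and
// the new CUDA functor further down: 0.5f * x * (1.0f + erf(x * M_SQRT1_2)).
static float gelu_ref(float x)
{
    return 0.5f * x * (1.0f + std::erf(x / std::sqrt(2.0f)));
}

int main()
{
    for (float x : {-2.0f, -0.5f, 0.0f, 0.5f, 2.0f})
        std::printf("gelu(% .1f) = % .6f\n", x, gelu_ref(x));
    return 0;
}
```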
@@ -643,4 +643,69 @@ INSTANTIATE_TEST_CASE_P(/**/, Layer_ScatterND, testing::Values(std::make_tuple(D
INSTANTIATE_TEST_CASE_P(/**/, Layer_LayerNorm, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU)));
INSTANTIATE_TEST_CASE_P(/**/, Layer_LayerNormExpanded, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU)));

typedef TestBaseWithParam<tuple<Vec4i, int, bool, tuple<Backend, Target> > > Layer_FullyConnected;
PERF_TEST_P_(Layer_FullyConnected, fc)
{
    std::vector<int> inpShape;
    inpShape.reserve(4);
    for (int i = 0; i < 4; ++i) {
        int dim = get<0>(GetParam())[i];
        if (dim == 0)
            break;
        inpShape.push_back(dim);
    }
    Mat input(inpShape, CV_32F);
    randn(input, 0, 1);

    int axis = input.dims - 1;
    int outDims = get<1>(GetParam());
    bool isMatMul = get<2>(GetParam());
    int backendId = get<0>(get<3>(GetParam()));
    int targetId = get<1>(get<3>(GetParam()));

    std::vector<int> weightShape;
    if (isMatMul) {
        weightShape = inpShape;
        weightShape[weightShape.size() - 2] = outDims;
    } else {
        weightShape = {outDims, (int)input.total(axis, input.dims)};
    }
    Mat weights(weightShape, CV_32F);
    randn(weights, 0, 1);

    LayerParams lp;
    lp.set("axis", input.dims - 1);
    lp.set("is_matmul", weights.dims > 2);
    lp.set("bias_term", false);
    lp.set("transB", true);
    lp.set("num_output", (int)weights.total(0, weights.dims - 1));
    lp.blobs.resize(1, weights);

    Net net;
    net.addLayerToPrev("matmul", "InnerProduct", lp);

    net.setInput(input);
    net.setPreferableBackend(backendId);
    net.setPreferableTarget(targetId);

    // warmup
    Mat output = net.forward();

    TEST_CYCLE()
    {
        net.forward();
    }
    SANITY_CHECK_NOTHING();
}
INSTANTIATE_TEST_CASE_P(/**/, Layer_FullyConnected, Combine(
    Values( // input size
        Vec4i(5, 512, 384),
        Vec4i(5, 16, 512, 128)
    ),
    Values(256, 512, 1024), // output dimension
    testing::Bool(), // is_matmul
    dnnBackendsAndTargets()
));

} // namespace
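For reference (not part of the patch): with the input size Vec4i(5, 16, 512, 128) and an output dimension of 256, the branch above produces weights shaped {5, 16, 256, 128} in the is_matmul case (batched MatMul with transB = true) and {256, 128} in the plain fully connected case. A small standalone sketch of that shape logic:

```cpp
#include <cstdio>
#include <vector>

int main()
{
    std::vector<int> inpShape = {5, 16, 512, 128};  // one of the tested input sizes
    int outDims = 256;                              // one of the tested output dimensions

    // is_matmul == true: keep the input rank and replace the next-to-last
    // dimension with the output size (weights act as the transposed B operand).
    std::vector<int> matmulShape = inpShape;
    matmulShape[matmulShape.size() - 2] = outDims;  // {5, 16, 256, 128}

    // is_matmul == false: a 2-D weight matrix with one row per output channel,
    // each row as long as the flattened trailing input axis.
    std::vector<int> fcShape = {outDims, inpShape.back()};  // {256, 128}

    for (int d : matmulShape) std::printf("%d ", d);
    std::printf("(is_matmul)  vs  ");
    for (int d : fcShape) std::printf("%d ", d);
    std::printf("(fully connected)\n");
    return 0;
}
```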
@@ -248,6 +248,11 @@ void selu(const Stream& stream, Span<T> output, View<T> input, T alpha, T gamma)
    generic_op<T, SeluFunctor<T>>(stream, output, input, {alpha, gamma});
}

template <class T>
void gelu(const Stream& stream, Span<T> output, View<T> input) {
    generic_op<T, GeluFunctor<T>>(stream, output, input);
}

template <class T>
void sign(const Stream& stream, Span<T> output, View<T> input) {
    generic_op<T, SignFunctor<T>>(stream, output, input);
@@ -324,6 +329,7 @@ template void tan<__half>(const Stream&, Span<__half>, View<__half>);
template void celu<__half>(const Stream&, Span<__half>, View<__half>, __half);
template void hardsigmoid<__half>(const Stream&, Span<__half>, View<__half>, __half, __half);
template void selu<__half>(const Stream&, Span<__half>, View<__half>, __half, __half);
template void gelu<__half>(const Stream&, Span<__half>, View<__half>);
template void thresholdedrelu<__half>(const Stream&, Span<__half>, View<__half>, __half);
template void power<__half>(const Stream&, Span<__half>, View<__half>, __half, __half, __half);
template void exp<__half>(const Stream&, Span<__half>, View<__half>, __half, __half);
@@ -366,6 +372,7 @@ template void tan<float>(const Stream&, Span<float>, View<float>);
template void celu<float>(const Stream&, Span<float>, View<float>, float);
template void hardsigmoid<float>(const Stream&, Span<float>, View<float>, float, float);
template void selu<float>(const Stream&, Span<float>, View<float>, float, float);
template void gelu<float>(const Stream&, Span<float>, View<float>);
template void thresholdedrelu<float>(const Stream&, Span<float>, View<float>, float);
template void power<float>(const Stream&, Span<float>, View<float>, float, float, float);
template void exp<float>(const Stream&, Span<float>, View<float>, float, float);
@@ -588,6 +588,21 @@ struct SeluFunctor {
    T alpha, gamma;
};

template <class T>
struct GeluFunctor {
    struct Params {
        CUDA4DNN_HOST_DEVICE Params() { }
    };

    CUDA4DNN_DEVICE GeluFunctor() { }
    CUDA4DNN_DEVICE GeluFunctor(const Params& params) { }

    CUDA4DNN_DEVICE T operator()(T value) {
        using csl::device::erf;
        return static_cast<T>(0.5f) * value * (static_cast<T>(1.f) + erf(value * static_cast<T>(M_SQRT1_2)));
    }
};

template <class T>
struct ThresholdedReluFunctor {
    struct Params {
@@ -114,6 +114,9 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
template <class T>
void selu(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, T alpha, T gamma);

template <class T>
void gelu(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input);

template <class T>
void thresholdedrelu(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, T alpha);
@@ -537,6 +537,20 @@ namespace cv { namespace dnn { namespace cuda4dnn {
    const T alpha, gamma;
};

template <class T>
class GeluOp final : public BaseOp<GeluOp, T> {
public:
    GeluOp(csl::Stream stream_) : stream(std::move(stream_)) { }

    void calculate(csl::TensorSpan<T> output, csl::TensorView<T> input) const
    {
        kernels::gelu<T>(stream, output, input);
    }

private:
    csl::Stream stream;
};

template <class T>
class ThresholdedReluOp final : public BaseOp<ThresholdedReluOp, T> {
public:
@@ -821,7 +821,7 @@ struct GeluFunctor : public BaseDefaultFunctor<GeluFunctor>

    bool supportBackend(int backendId, int)
    {
-       return backendId == DNN_BACKEND_OPENCV;
+       return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_CUDA;
    }

    inline float calculate(float x) const
@@ -829,6 +829,13 @@ struct GeluFunctor : public BaseDefaultFunctor<GeluFunctor>
        return 0.5f * x * (1.0f + erf(x * M_SQRT1_2));
    }

#ifdef HAVE_CUDA
    Ptr<BackendNode> initCUDA(int target, csl::Stream stream)
    {
        return make_cuda_node<cuda4dnn::GeluOp>(target, stream);
    }
#endif

    int64 getFLOPSPerElement() const { return 100; }
};

@@ -630,8 +630,10 @@ public:

        if(input_wrapper->getRank() == inp2Dim)
            return make_cuda_node<cuda4dnn::MatMulOp>(preferableTarget, std::move(context->stream), std::move(context->cublas_handle), oriMat, biasMat_, transA, transB);
-       else
+       else {
            CV_LOG_INFO(NULL, "DNN/CUDA: no implementation for MatMul with rank " << input_wrapper->getRank());
            return Ptr<BackendNode>();
        }
    }

    auto flatten_start_axis = normalize_axis(axis, input_wrapper->getRank());
@@ -1965,9 +1965,11 @@ void ONNXImporter::parseGemm(LayerParams& layerParams, const opencv_onnx::NodePr
    }

    int transB = layerParams.get<int>("transB", 0);
    int secondInpDims;
    if (constBlobs.find(node_proto.input(1)) != constBlobs.end())
    {
        Mat weights = getBlob(node_proto, 1);
        secondInpDims = weights.dims;

        if (transA == 0) // optimized branch; for now, we can only optimize Gemm when transA = 0.
        {
@@ -1993,7 +1995,10 @@ void ONNXImporter::parseGemm(LayerParams& layerParams, const opencv_onnx::NodePr
        }
    }
    else
    {
        layerParams.set("transB", transB == 1);
        secondInpDims = outShapes[node_proto.input(1)].size();
    }

    if (node_proto.input_size() == 3)
    {
@@ -2002,7 +2007,7 @@ void ONNXImporter::parseGemm(LayerParams& layerParams, const opencv_onnx::NodePr
    }

    layerParams.set("bias_term", node_proto.input_size() == 3);
-   layerParams.set("is_matmul", true);
+   layerParams.set("is_matmul", secondInpDims > 2);
    addLayer(layerParams, node_proto);
}

@@ -2045,7 +2050,7 @@ void ONNXImporter::parseMatMul(LayerParams& layerParams, const opencv_onnx::Node
        layerParams.blobs.push_back(transBlob);
        int numOutput = layerParams.blobs[0].total(0, secondInpDims - 1);
        layerParams.set("num_output", numOutput);
-       layerParams.set("is_matmul", true);
+       layerParams.set("is_matmul", secondInpDims > 2);
    } else
        secondInpDims = outShapes[node_proto.input(1)].size();

@@ -102,7 +102,7 @@ public:
        netSoftmax.setInput(ref);
        ref = netSoftmax.forward();
    }
-   normAssert(ref, out, "", l1 ? l1 : default_l1, lInf ? lInf : default_lInf);
+   normAssert(ref, out, basename.c_str(), l1 ? l1 : default_l1, lInf ? lInf : default_lInf);
    if (checkNoFallbacks)
        expectNoFallbacksFromIE(net);
}
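With the patch applied, the new CUDA path can be exercised from user code in the usual way. A minimal sketch, assuming an OpenCV build with CUDA support and the activation registered under the layer type string "Gelu" (as in current OpenCV dnn); it mirrors the addLayerToPrev pattern used by the perf test above:

```cpp
#include <opencv2/core.hpp>
#include <opencv2/dnn.hpp>
#include <cstdio>

int main()
{
    using namespace cv;
    using namespace cv::dnn;

    // A single Gelu activation layer fed directly by the network input.
    LayerParams lp;
    Net net;
    net.addLayerToPrev("gelu", "Gelu", lp);

    Mat input(1, 64, CV_32F);
    randn(input, 0, 1);

    net.setInput(input);
    net.setPreferableBackend(DNN_BACKEND_CUDA);
    net.setPreferableTarget(DNN_TARGET_CUDA);

    Mat out = net.forward();
    std::printf("forward produced a %d x %d output\n", out.rows, out.cols);
    return 0;
}
```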