mirror of https://github.com/opencv/opencv.git
Merge pull request #25147 from fengyuentau:dnn/elementwise_layers/speedup
* added v_erf and implemented gelu acceleration via vectorization
* remove anonymous v_erf and use v_erf from intrin_math
* enable perf for ov and cuda backend
This commit is contained in:
parent 31b308f882
commit e3858cc5a3
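For context: the operation being accelerated is the erf-based GELU activation, GELU(x) = 0.5 * x * (1 + erf(x / sqrt(2))). A minimal scalar reference (an illustration, not part of the commit, assuming only <cmath>):

    #include <cmath>

    // Scalar reference for the erf-based GELU that the diff below vectorizes.
    // 0.7071067811865476f is 1/sqrt(2), i.e. M_SQRT1_2 written out as a literal.
    inline float gelu_ref(float x) {
        return 0.5f * x * (1.0f + std::erf(x * 0.7071067811865476f));
    }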
@@ -975,4 +975,49 @@ INSTANTIATE_TEST_CASE_P(/**/, Layer_Softmax, Combine(
                           /* withCann= */ false) // only test on CPU
 ));
 
+using Layer_Elementwise = TestBaseWithParam<tuple<std::vector<int>, std::string, tuple<Backend, Target>>>;
+PERF_TEST_P_(Layer_Elementwise, elementwise) {
+    std::vector<int> input_shape = get<0>(GetParam());
+    std::string op = get<1>(GetParam());
+    int backend_id = get<0>(get<2>(GetParam()));
+    int target_id = get<1>(get<2>(GetParam()));
+
+    Mat input(input_shape, CV_32F);
+    randn(input, 0.f, 1.f);
+
+    LayerParams lp;
+    lp.type = op;
+    lp.name = "TestLayer";
+
+    Net net;
+    net.addLayerToPrev(lp.name, lp.type, lp);
+
+    // Warmup
+    {
+        net.setInput(input);
+        net.setPreferableBackend(backend_id);
+        net.setPreferableTarget(target_id);
+        Mat out = net.forward();
+    }
+
+    TEST_CYCLE() {
+        net.forward();
+    }
+
+    SANITY_CHECK_NOTHING();
+}
+
+INSTANTIATE_TEST_CASE_P(/**/, Layer_Elementwise, testing::Combine(
+    testing::Values(std::vector<int>{1, 50, 3072}),
+    testing::Values(std::string{"Gelu"}),
+    dnnBackendsAndTargets(/* withInferenceEngine= */ true,
+                          /* withHalide= */ false,
+                          /* withCpuOCV= */ true,
+                          /* withVkCom= */ false,
+                          /* withCUDA= */ true,
+                          /* withNgraph= */ true,
+                          /* withWebnn= */ false,
+                          /* withCann= */ false)
+));
 
 } // namespace
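The benchmark above drives a single standalone layer through Net::addLayerToPrev. The same setup can be reproduced outside the perf harness; a minimal sketch (an illustration, not part of the commit) timing the default backend with cv::TickMeter:

    #include <opencv2/dnn.hpp>
    #include <opencv2/core/utility.hpp>
    #include <iostream>

    int main() {
        cv::dnn::LayerParams lp;
        lp.type = "Gelu";
        lp.name = "TestLayer";

        cv::dnn::Net net;
        net.addLayerToPrev(lp.name, lp.type, lp);

        // Same shape as the perf test: 1 x 50 x 3072, standard-normal values.
        cv::Mat input(std::vector<int>{1, 50, 3072}, CV_32F);
        cv::randn(input, cv::Scalar(0.f), cv::Scalar(1.f));

        net.setInput(input);
        net.forward(); // warmup: triggers allocation and backend initialization

        cv::TickMeter tm;
        tm.start();
        for (int i = 0; i < 100; i++)
            net.forward();
        tm.stop();
        std::cout << tm.getTimeMilli() / 100.0 << " ms per forward" << std::endl;
        return 0;
    }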
@@ -813,20 +813,82 @@ private:
     static const char* const ocl_kernel_name;
 };
 
-struct GeluFunctor : public BaseDefaultFunctor<GeluFunctor>
-{
-    typedef GeluLayer Layer;
-
-    explicit GeluFunctor() {}
+namespace {
+    // Refer to v_erf in modules/core/include/opencv2/core/hal/intrin_math.hpp
+    constexpr float c_erf_coef0 = 0.3275911f;
+    constexpr float c_erf_coef1 = 1.061405429f;
+    constexpr float c_erf_coef2 = -1.453152027f;
+    constexpr float c_erf_coef3 = 1.421413741f;
+    constexpr float c_erf_coef4 = -0.284496736f;
+    constexpr float c_erf_coef5 = 0.254829592f;
 
-    bool supportBackend(int backendId, int)
-    {
-        return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_CUDA;
+    inline float erf_approx(float v) {
+        float t = 1.f / fmaf(fabsf(v), c_erf_coef0, 1.f);
+        float r = fmaf(c_erf_coef1, t, c_erf_coef2);
+        r = fmaf(r, t, c_erf_coef3);
+        r = fmaf(r, t, c_erf_coef4);
+        r = fmaf(r, t, c_erf_coef5);
+        r = 1.f - r * t * expf(-v * v);
+        return std::copysignf(r, v);
     }
+}
 
-    inline float calculate(float x) const
-    {
-        return 0.5f * x * (1.0f + erf(x * M_SQRT1_2));
+struct GeluFunctor : public BaseFunctor {
+    using Layer = GeluLayer;
+    int vlanes;
+
+    explicit GeluFunctor() {
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+        vlanes = VTraits<v_float32>::vlanes();
+#else
+        vlanes = 1;
+#endif
+    }
+
+    bool supportBackend(int backendId, int) {
+        return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_CUDA || backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH;
+    }
+
+    void apply(const float* srcptr, float* dstptr, int stripeStart, int len, size_t planeSize, int cn0, int cn1) const {
+        CV_UNUSED(stripeStart);
+        for (int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize) {
+            int i = 0;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+            // 0.5f * x * (1.0f + erf(x * M_SQRT1_2));
+            v_float32 half = vx_setall_f32(0.5f),
+                      one = vx_setall_f32(1.0f),
+                      reciprocal_sqrt2 = vx_setall_f32(M_SQRT1_2);
+            for (; i <= len - vlanes; i += vlanes) {
+                if (i + vlanes > len) {
+                    if (i == 0 || i == len) {
+                        break;
+                    }
+                    i = len - vlanes;
+                }
+                v_float32 x0 = vx_load(srcptr + i);
+
+                // t = x * M_SQRT1_2
+                v_float32 t0 = v_mul(reciprocal_sqrt2, x0);
+
+                // t = 1.0f + erf(t)
+                t0 = v_add(one, v_erf(t0));
+
+                // x = 0.5 * x
+                x0 = v_mul(half, x0);
+
+                // x = x * t
+                x0 = v_mul(x0, t0);
+
+                vx_store(dstptr + i, x0);
+            }
+#endif
+            // 0.5f * x * (1.0f + erf(x * M_SQRT1_2));
+            for (; i < len; i++) {
+                float x = srcptr[i];
+                dstptr[i] = 0.5f * x * (1.0f + erf_approx(x * M_SQRT1_2));
+            }
+        }
     }
 
 #ifdef HAVE_CUDA
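The scalar erf_approx above mirrors v_erf from intrin_math.hpp: both evaluate the classic Abramowitz-Stegun rational approximation 7.1.26,

    erf(x) ~= 1 - (a1*t + a2*t^2 + a3*t^3 + a4*t^4 + a5*t^5) * exp(-x^2),  t = 1 / (1 + p*|x|),

with p = 0.3275911 and a1..a5 = 0.254829592, -0.284496736, 1.421413741, -1.453152027, 1.061405429; the maximum absolute error is about 1.5e-7, and the sign is restored with copysignf. The coefficients are stored highest-order first (c_erf_coef1 = a5 through c_erf_coef5 = a1), so the fmaf chain is a Horner evaluation of the polynomial. Two observations on the SIMD loop: the tail-handling branch (i + vlanes > len) appears unreachable under the loop condition i <= len - vlanes, so leftover elements fall through to the scalar loop instead; and trading libm's erf for these polynomial approximations is presumably what drives the relaxed gelu tolerances (roughly 1e-05 to 1e-04) in the conformance hunks below.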
@@ -836,12 +898,55 @@ struct GeluFunctor : public BaseDefaultFunctor<GeluFunctor>
     }
 #endif
 
+#ifdef HAVE_OPENCL
+    bool initKernel(ocl::Kernel &ker, const UMat &src) const
+    {
+        String buildopt = oclGetTMacro(src);
+
+        if (!ker.create("GeluForward", ocl::dnn::activations_oclsrc, buildopt))
+            return false;
+
+        return true;
+    }
+
+    bool applyOCL(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
+    {
+        std::vector<UMat> inputs;
+        std::vector<UMat> outputs;
+
+        inps.getUMatVector(inputs);
+        outs.getUMatVector(outputs);
+
+        for (size_t i = 0; i < inputs.size(); i++)
+        {
+            UMat& src = inputs[i];
+            UMat& dst = outputs[i];
+            CV_Assert(src.isContinuous() && dst.isContinuous() && !src.offset && !dst.offset);
+
+            ocl::Kernel kernel;
+            CV_Assert(initKernel(kernel, src));
+            kernel.set(0, (int)src.total());
+            kernel.set(1, ocl::KernelArg::PtrReadOnly(src));
+            kernel.set(2, ocl::KernelArg::PtrWriteOnly(dst));
+
+            size_t gSize = src.total();
+            CV_Assert(kernel.run(1, &gSize, NULL, false));
+        }
+
+        return true;
+    }
+#endif
+
+#ifdef HAVE_DNN_NGRAPH
+    std::shared_ptr<ngraph::Node> initNgraphAPI(const ngraph::Output<ngraph::Node>& node)
+    {
+        return std::make_shared<ov::op::v0::Gelu>(node);
+    }
+#endif // HAVE_DNN_NGRAPH
+
     int64 getFLOPSPerElement() const { return 100; }
 };
 
-template<>
-const char* const BaseDefaultFunctor<GeluFunctor>::ocl_kernel_name = "GeluForward";
-
 namespace GeluApproximationConstants
 {
     static constexpr float sqrt_2_pi = 0.7978845834732056f;
@@ -48,6 +48,10 @@
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 #endif
 
+#if !defined(M_SQRT1_2)
+#define M_SQRT1_2 0.707106781186547524400844362104849039 /* 1/sqrt(2) */
+#endif
+
 __kernel void ReLUForward(const int count, __global const T* in, __global T* out
 #ifndef RELU_NO_SLOPE
                           , KERNEL_ARG_DTYPE negative_slope
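The hunk above only adds the M_SQRT1_2 constant; the GeluForward kernel body that initKernel loads from activations_oclsrc lies outside the visible context. Based on the three arguments bound in applyOCL (element count, input pointer, output pointer), a plausible sketch in the style of the surrounding file (an assumption, not the committed kernel):

    __kernel void GeluForward(const int count, __global const T* in, __global T* out)
    {
        int index = get_global_id(0);
        if (index < count)
        {
            T x = in[index];
            // 0.5 * x * (1 + erf(x / sqrt(2))); erf is a built-in OpenCL function
            out[index] = (T)0.5f * x * ((T)1.f + erf(x * (T)M_SQRT1_2));
        }
    }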
@@ -282,6 +282,14 @@ static const TestCase testConformanceConfig[] = {
     {"test_gathernd_example_float32", 2, 1},
     {"test_gathernd_example_int32", 2, 1},
     {"test_gathernd_example_int32_batch_dim1", 2, 1},
+    {"test_gelu_default_1", 1, 1},
+    {"test_gelu_default_1_expanded", 1, 1},
+    {"test_gelu_default_2", 1, 1},
+    {"test_gelu_default_2_expanded", 1, 1},
+    {"test_gelu_tanh_1", 1, 1},
+    {"test_gelu_tanh_1_expanded", 1, 1},
+    {"test_gelu_tanh_2", 1, 1},
+    {"test_gelu_tanh_2_expanded", 1, 1},
     {"test_gemm_all_attributes", 3, 1},
     {"test_gemm_alpha", 3, 1},
     {"test_gemm_beta", 3, 1},
@@ -1123,6 +1131,19 @@ TEST_P(Test_ONNX_conformance, Layer_Test)
         if (name == "test_pow") {
             default_lInf = 0.00013; // Expected: (normInf) <= (lInf), actual: 0.00012207 vs 0.0001
         }
+        if (name == "test_gelu_tanh_1") {
+            default_l1 = 0.00011; // Expected: (normL1) <= (l1), actual: 0.000101805 vs 1e-05
+            default_lInf = 0.00016; // Expected: (normInf) <= (lInf), actual: 0.000152707 vs 0.0001
+        }
+        if (name == "test_gelu_tanh_2") {
+            if (target == DNN_TARGET_OPENCL_FP16) {
+                default_l1 = 0.00016; // Expected: (normL1) <= (l1), actual: 0.000157223 vs 9e-05
+                default_lInf = 0.0016; // Expected: (normInf) <= (lInf), actual: 0.00153041 vs 0.0005
+            } else {
+                default_l1 = 9e-5; // Expected: (normL1) <= (l1), actual: 8.80073e-05 vs 1e-05
+                default_lInf = 0.0005; // Expected: (normInf) <= (lInf), actual: 0.000455521 vs 0.0001
+            }
+        }
     }
 #ifdef HAVE_HALIDE
     else if (backend == DNN_BACKEND_HALIDE)
@@ -1146,6 +1167,15 @@ TEST_P(Test_ONNX_conformance, Layer_Test)
         {
             applyTestTag(CV_TEST_TAG_DNN_SKIP_VULKAN, CV_TEST_TAG_DNN_SKIP_ONNX_CONFORMANCE);
         }
+
+        if (name == "test_gelu_tanh_1") {
+            default_l1 = 0.00011; // Expected: (normL1) <= (l1), actual: 0.000101805 vs 1e-05
+            default_lInf = 0.00016; // Expected: (normInf) <= (lInf), actual: 0.000152707 vs 0.0001
+        }
+        if (name == "test_gelu_tanh_2") {
+            default_l1 = 9e-5; // Expected: (normL1) <= (l1), actual: 8.80073e-05 vs 1e-05
+            default_lInf = 0.0005; // Expected: (normInf) <= (lInf), actual: 0.000455521 vs 0.0001
+        }
     }
 #endif
 #ifdef HAVE_CUDA
@@ -1159,6 +1189,20 @@ TEST_P(Test_ONNX_conformance, Layer_Test)
         {
             applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA_FP16, CV_TEST_TAG_DNN_SKIP_ONNX_CONFORMANCE);
         }
+
+        if (name == "test_gelu_tanh_1") {
+            default_l1 = 0.00011; // Expected: (normL1) <= (l1), actual: 0.000101815 vs 1e-05
+            default_lInf = 0.00016; // Expected: (normInf) <= (lInf), actual: 0.000152737 vs 0.0001
+        }
+        if (name == "test_gelu_tanh_2") {
+            if (target == DNN_TARGET_CUDA_FP16) {
+                default_l1 = 0.00023; // Expected: (normL1) <= (l1), actual: 0.000220591 vs 9e-05
+                default_lInf = 0.0023; // Expected: (normInf) <= (lInf), actual: 0.00220466 vs 0.0005
+            } else {
+                default_l1 = 9e-5; // Expected: (normL1) <= (l1), actual: 8.80127e-05 vs 1e-05
+                default_lInf = 0.0005; // Expected: (normInf) <= (lInf), actual: 0.000455445 vs 0.0001
+            }
+        }
     }
 #endif
     else
@@ -688,6 +688,36 @@ CASE(test_gathernd_example_int32)
     // no filter
 CASE(test_gathernd_example_int32_batch_dim1)
     // no filter
+CASE(test_gelu_default_1)
+    // no filter
+CASE(test_gelu_default_1_expanded)
+    // no filter
+CASE(test_gelu_default_2)
+    // no filter
+CASE(test_gelu_default_2_expanded)
+    // no filter
+CASE(test_gelu_tanh_1)
+    if (target == DNN_TARGET_CPU) {
+        default_l1 = 0.00011; // Expected: (normL1) <= (l1), actual: 0.000101805 vs 1e-05
+        default_lInf = 0.00016; // Expected: (normInf) <= (lInf), actual: 0.000152707 vs 0.0001
+    }
+    if (target == DNN_TARGET_OPENCL) {
+        default_l1 = 0.00011; // Expected: (normL1) <= (l1), actual: 0.000101815 vs 1e-05
+        default_lInf = 0.00016; // Expected: (normInf) <= (lInf), actual: 0.000152737 vs 0.0001
+    }
+CASE(test_gelu_tanh_1_expanded)
+    // no filter
+CASE(test_gelu_tanh_2)
+    if (target == DNN_TARGET_CPU) {
+        default_l1 = 9e-5; // Expected: (normL1) <= (l1), actual: 8.80057e-05 vs 1e-05
+        default_lInf = 0.00046; // Expected: (normInf) <= (lInf), actual: 0.000455521 vs 0.0001
+    }
+    if (target == DNN_TARGET_OPENCL) {
+        default_l1 = 9e-5; // Expected: (normL1) <= (l1), actual: 8.80144e-05 vs 1e-05
+        default_lInf = 0.00046; // Expected: (normInf) <= (lInf), actual: 0.000455445 vs 0.0001
+    }
+CASE(test_gelu_tanh_2_expanded)
+    // no filter
 CASE(test_gemm_all_attributes)
     // no filter
 CASE(test_gemm_alpha)
@@ -117,6 +117,10 @@
     "test_gathernd_example_float32",
     "test_gathernd_example_int32",
     "test_gathernd_example_int32_batch_dim1",
+    "test_gelu_default_1_expanded", // parser: no corresponding layer for CastLike
+    "test_gelu_default_2_expanded", // parser: no corresponding layer for CastLike
+    "test_gelu_tanh_1_expanded", // parser: no corresponding layer for CastLike
+    "test_gelu_tanh_2_expanded", // parser: no corresponding layer for CastLike
     "test_gemm_all_attributes",
     "test_gemm_alpha",
     "test_gemm_beta",
@@ -3149,6 +3149,15 @@ TEST_P(Test_ONNX_nets, ViT_B_32) {
         l1 = 0.008;
         lInf = 0.04;
     }
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) {
+        if (target == DNN_TARGET_CPU) {
+            l1 = 4.4e-5; // Expected: (normL1) <= (l1), actual: 4.31208e-05 vs 1e-05
+            lInf = 0.0002; // Expected: (normInf) <= (lInf), actual: 0.000194907 vs 0.0001
+        } else if (target == DNN_TARGET_OPENCL || target == DNN_TARGET_OPENCL_FP16) {
+            l1 = 0.0092; // Expected: (normL1) <= (l1), actual: 0.00918349 vs 4.4e-05
+            lInf = 0.056; // Expected: (normInf) <= (lInf), actual: 0.0556431 vs 0.0002
+        }
+    }
 
     normAssert(ref, out, "ViTB_32", l1, lInf);
 }