mirror of
https://github.com/opencv/opencv.git
synced 2025-06-10 19:24:07 +08:00
Merge pull request #25881 from fengyuentau:dnn/cpu/optimize_activations_with_v_exp
dnn: optimize activations with v_exp #25881 Merge with https://github.com/opencv/opencv_extra/pull/1191. This PR optimizes the following activations: - [x] Swish - [x] Mish - [x] Elu - [x] Celu - [x] Selu - [x] HardSwish ### Performance (Updated on 2024-07-18) #### AmLogic A311D2 (ARM Cortex A73 + A53) ``` Geometric mean (ms) Name of Test activations activations.patch activations.patch vs activations (x-factor) Celu::Layer_Elementwise::OCV/CPU 115.859 27.930 4.15 Elu::Layer_Elementwise::OCV/CPU 27.846 27.003 1.03 Gelu::Layer_Elementwise::OCV/CPU 0.657 0.602 1.09 HardSwish::Layer_Elementwise::OCV/CPU 31.885 6.781 4.70 Mish::Layer_Elementwise::OCV/CPU 35.729 32.089 1.11 Selu::Layer_Elementwise::OCV/CPU 61.955 27.850 2.22 Swish::Layer_Elementwise::OCV/CPU 30.819 26.688 1.15 ``` #### Apple M1 ``` Geometric mean (ms) Name of Test activations activations.patch activations.patch vs activations (x-factor) Celu::Layer_Elementwise::OCV/CPU 16.184 2.118 7.64 Celu::Layer_Elementwise::OCV/CPU_FP16 16.280 2.123 7.67 Elu::Layer_Elementwise::OCV/CPU 9.123 1.878 4.86 Elu::Layer_Elementwise::OCV/CPU_FP16 9.085 1.897 4.79 Gelu::Layer_Elementwise::OCV/CPU 0.089 0.081 1.11 Gelu::Layer_Elementwise::OCV/CPU_FP16 0.086 0.074 1.17 HardSwish::Layer_Elementwise::OCV/CPU 1.560 1.555 1.00 HardSwish::Layer_Elementwise::OCV/CPU_FP16 1.536 1.523 1.01 Mish::Layer_Elementwise::OCV/CPU 6.077 2.476 2.45 Mish::Layer_Elementwise::OCV/CPU_FP16 5.990 2.496 2.40 Selu::Layer_Elementwise::OCV/CPU 11.351 1.976 5.74 Selu::Layer_Elementwise::OCV/CPU_FP16 11.533 1.985 5.81 Swish::Layer_Elementwise::OCV/CPU 4.687 1.890 2.48 Swish::Layer_Elementwise::OCV/CPU_FP16 4.715 1.873 2.52 ``` #### Intel i7-12700K ``` Geometric mean (ms) Name of Test activations activations.patch activations.patch vs activations (x-factor) Celu::Layer_Elementwise::OCV/CPU 17.106 3.560 4.81 Elu::Layer_Elementwise::OCV/CPU 5.064 3.478 1.46 Gelu::Layer_Elementwise::OCV/CPU 0.036 0.035 1.04 HardSwish::Layer_Elementwise::OCV/CPU 2.914 2.893 
1.01 Mish::Layer_Elementwise::OCV/CPU 3.820 3.529 1.08 Selu::Layer_Elementwise::OCV/CPU 10.799 3.593 3.01 Swish::Layer_Elementwise::OCV/CPU 3.651 3.473 1.05 ``` ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - [x] There is a reference to the original bug report and related work - [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [x] The feature is well documented and sample code can be built with the project CMake
This commit is contained in:
parent
26714f9a34
commit
23b244d3a3
@ -975,49 +975,72 @@ INSTANTIATE_TEST_CASE_P(/**/, Layer_Softmax, Combine(
|
|||||||
/* withCann= */ false) // only test on CPU
|
/* withCann= */ false) // only test on CPU
|
||||||
));
|
));
|
||||||
|
|
||||||
using Layer_Elementwise = TestBaseWithParam<tuple<std::vector<int>, std::string, tuple<Backend, Target>>>;
|
struct Layer_Elementwise : public TestBaseWithParam<tuple<Backend, Target>> {
|
||||||
PERF_TEST_P_(Layer_Elementwise, elementwise) {
|
void test_layer(const std::string &op_type, const std::vector<int> &input_shape) {
|
||||||
std::vector<int> input_shape = get<0>(GetParam());
|
int backend_id = get<0>(GetParam());
|
||||||
std::string op = get<1>(GetParam());
|
int target_id = get<1>(GetParam());
|
||||||
int backend_id = get<0>(get<2>(GetParam()));
|
|
||||||
int target_id = get<1>(get<2>(GetParam()));
|
|
||||||
|
|
||||||
Mat input(input_shape, CV_32F);
|
Mat input(input_shape, CV_32F);
|
||||||
randn(input, 0.f, 1.f);
|
randu(input, -10.0f, 10.f);
|
||||||
|
|
||||||
LayerParams lp;
|
LayerParams lp;
|
||||||
lp.type = op;
|
lp.type = op_type;
|
||||||
lp.name = "TestLayer";
|
lp.name = cv::format("PerfLayer/%s", op_type.c_str());
|
||||||
|
|
||||||
Net net;
|
Net net;
|
||||||
net.addLayerToPrev(lp.name, lp.type, lp);
|
net.addLayerToPrev(lp.name, lp.type, lp);
|
||||||
|
|
||||||
// Warmup
|
// Warmup
|
||||||
{
|
{
|
||||||
net.setInput(input);
|
net.setInput(input);
|
||||||
net.setPreferableBackend(backend_id);
|
net.setPreferableBackend(backend_id);
|
||||||
net.setPreferableTarget(target_id);
|
net.setPreferableTarget(target_id);
|
||||||
Mat out = net.forward();
|
net.forward();
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST_CYCLE() {
|
||||||
|
net.forward();
|
||||||
|
}
|
||||||
|
|
||||||
|
SANITY_CHECK_NOTHING();
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_CYCLE() {
|
int N = 2;
|
||||||
net.forward();
|
int C = 32;
|
||||||
}
|
int H = 416;
|
||||||
|
int W = 416;
|
||||||
|
};
|
||||||
|
|
||||||
SANITY_CHECK_NOTHING();
|
PERF_TEST_P_(Layer_Elementwise, Gelu) {
|
||||||
|
test_layer("Gelu", std::vector<int>{1, 50, 3072});
|
||||||
|
}
|
||||||
|
PERF_TEST_P_(Layer_Elementwise, Swish) {
|
||||||
|
test_layer("Swish", std::vector<int>{N, C, H, W});
|
||||||
|
}
|
||||||
|
PERF_TEST_P_(Layer_Elementwise, Mish) {
|
||||||
|
test_layer("Mish", std::vector<int>{N, C, H, W});
|
||||||
|
}
|
||||||
|
PERF_TEST_P_(Layer_Elementwise, Elu) {
|
||||||
|
test_layer("ELU", std::vector<int>{N, C, H, W});
|
||||||
|
}
|
||||||
|
PERF_TEST_P_(Layer_Elementwise, Celu) {
|
||||||
|
test_layer("Celu", std::vector<int>{N, C, H, W});
|
||||||
|
}
|
||||||
|
PERF_TEST_P_(Layer_Elementwise, Selu) {
|
||||||
|
test_layer("Selu", std::vector<int>{N, C, H, W});
|
||||||
|
}
|
||||||
|
PERF_TEST_P_(Layer_Elementwise, HardSwish) {
|
||||||
|
test_layer("HardSwish", std::vector<int>{N, C, H, W});
|
||||||
}
|
}
|
||||||
|
|
||||||
INSTANTIATE_TEST_CASE_P(/**/, Layer_Elementwise, testing::Combine(
|
INSTANTIATE_TEST_CASE_P(/**/, Layer_Elementwise,
|
||||||
testing::Values(std::vector<int>{1, 50, 3072}),
|
dnnBackendsAndTargets(/* withInferenceEngine= */ true,
|
||||||
testing::Values(std::string{"Gelu"}),
|
/* withHalide= */ false,
|
||||||
dnnBackendsAndTargets(/* withInferenceEngine= */ true,
|
/* withCpuOCV= */ true,
|
||||||
/* withHalide= */ false,
|
/* withVkCom= */ false,
|
||||||
/* withCpuOCV= */ true,
|
/* withCUDA= */ true,
|
||||||
/* withVkCom= */ false,
|
/* withNgraph= */ true,
|
||||||
/* withCUDA= */ true,
|
/* withWebnn= */ false,
|
||||||
/* withNgraph= */ true,
|
/* withCann= */ false));
|
||||||
/* withWebnn= */ false,
|
|
||||||
/* withCann= */ false) // only test on CPU
|
|
||||||
));
|
|
||||||
|
|
||||||
} // namespace
|
} // namespace
|
||||||
|
@ -859,12 +859,6 @@ struct GeluFunctor : public BaseFunctor {
|
|||||||
one = vx_setall_f32(1.0f),
|
one = vx_setall_f32(1.0f),
|
||||||
reciprocal_sqrt2 = vx_setall_f32(M_SQRT1_2);
|
reciprocal_sqrt2 = vx_setall_f32(M_SQRT1_2);
|
||||||
for (; i <= len - vlanes; i += vlanes) {
|
for (; i <= len - vlanes; i += vlanes) {
|
||||||
if (i + vlanes > len) {
|
|
||||||
if (i == 0 || i == len) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
i = len - vlanes;
|
|
||||||
}
|
|
||||||
v_float32 x0 = vx_load(srcptr + i);
|
v_float32 x0 = vx_load(srcptr + i);
|
||||||
|
|
||||||
// t = x * M_SQRT1_2
|
// t = x * M_SQRT1_2
|
||||||
@ -1048,7 +1042,17 @@ const char* const TanHFunctor::BaseDefaultFunctor<TanHFunctor>::ocl_kernel_name
|
|||||||
|
|
||||||
struct SwishFunctor : public BaseDefaultFunctor<SwishFunctor>
|
struct SwishFunctor : public BaseDefaultFunctor<SwishFunctor>
|
||||||
{
|
{
|
||||||
typedef SwishLayer Layer;
|
using Layer = SwishLayer;
|
||||||
|
|
||||||
|
int vlanes;
|
||||||
|
|
||||||
|
explicit SwishFunctor() {
|
||||||
|
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||||
|
vlanes = VTraits<v_float32>::vlanes();
|
||||||
|
#else
|
||||||
|
vlanes = 1;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
bool supportBackend(int backendId, int)
|
bool supportBackend(int backendId, int)
|
||||||
{
|
{
|
||||||
@ -1064,6 +1068,32 @@ struct SwishFunctor : public BaseDefaultFunctor<SwishFunctor>
|
|||||||
return x / (1.f + exp(-x));
|
return x / (1.f + exp(-x));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void apply(const float* srcptr, float* dstptr, int stripeStart, int len, size_t planeSize, int cn0, int cn1) const {
|
||||||
|
CV_UNUSED(stripeStart);
|
||||||
|
for (int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize) {
|
||||||
|
int i = 0;
|
||||||
|
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||||
|
// x / (1.f + exp(-x));
|
||||||
|
v_float32 one = vx_setall_f32(1.0f),
|
||||||
|
zero = vx_setzero_f32();
|
||||||
|
for (; i <= len - vlanes; i += vlanes) {
|
||||||
|
v_float32 x = vx_load(srcptr + i);
|
||||||
|
|
||||||
|
v_float32 t = v_sub(zero, x);
|
||||||
|
t = v_exp(t);
|
||||||
|
t = v_add(one, t);
|
||||||
|
t = v_div(x, t);
|
||||||
|
|
||||||
|
vx_store(dstptr + i, t);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
// In case SIMD is not available or len < vlanes
|
||||||
|
for (; i < len; i++) {
|
||||||
|
dstptr[i] = calculate(srcptr[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#ifdef HAVE_CUDA
|
#ifdef HAVE_CUDA
|
||||||
Ptr<BackendNode> initCUDA(int target, csl::Stream stream)
|
Ptr<BackendNode> initCUDA(int target, csl::Stream stream)
|
||||||
{
|
{
|
||||||
@ -1116,9 +1146,27 @@ struct SwishFunctor : public BaseDefaultFunctor<SwishFunctor>
|
|||||||
template<>
|
template<>
|
||||||
const char* const SwishFunctor::BaseDefaultFunctor<SwishFunctor>::ocl_kernel_name = "SwishForward";
|
const char* const SwishFunctor::BaseDefaultFunctor<SwishFunctor>::ocl_kernel_name = "SwishForward";
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
constexpr float MISH_THRESHOLD = -36.73f;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
This implementation is derived from
|
||||||
|
https://github.com/vpisarev/ficus/blob/3c9a8b78f49e17489c5e1fd6dd5dd487348c99c2/lib/NN/OpElemwise.fx#L110
|
||||||
|
*/
|
||||||
struct MishFunctor : public BaseDefaultFunctor<MishFunctor>
|
struct MishFunctor : public BaseDefaultFunctor<MishFunctor>
|
||||||
{
|
{
|
||||||
typedef MishLayer Layer;
|
using Layer = MishLayer;
|
||||||
|
|
||||||
|
int vlanes;
|
||||||
|
|
||||||
|
explicit MishFunctor() {
|
||||||
|
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||||
|
vlanes = VTraits<v_float32>::vlanes();
|
||||||
|
#else
|
||||||
|
vlanes = 1;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
bool supportBackend(int backendId, int)
|
bool supportBackend(int backendId, int)
|
||||||
{
|
{
|
||||||
@ -1131,15 +1179,34 @@ struct MishFunctor : public BaseDefaultFunctor<MishFunctor>
|
|||||||
|
|
||||||
inline float calculate(float x) const
|
inline float calculate(float x) const
|
||||||
{
|
{
|
||||||
// Use fast approximation introduced in https://github.com/opencv/opencv/pull/17200
|
float y = x > MISH_THRESHOLD ? std::exp(-x) : 1.f;
|
||||||
if (x >= 8.f)
|
x *= x > MISH_THRESHOLD ? 1.f : 0.f;
|
||||||
{
|
return x * (1 + 2 * y) / (1 + 2 * y + 2 * y * y);
|
||||||
return x;
|
}
|
||||||
}
|
|
||||||
|
|
||||||
float eX = exp(x);
|
void apply(const float* srcptr, float* dstptr, int stripeStart, int len, size_t planeSize, int cn0, int cn1) const {
|
||||||
float n = (eX + 2.f) * eX;
|
CV_UNUSED(stripeStart);
|
||||||
return (x * n) / (n + 2.f);
|
for (int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize) {
|
||||||
|
int i = 0;
|
||||||
|
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||||
|
v_float32 v_threshold = vx_setall_f32(MISH_THRESHOLD), one = vx_setall_f32(1.f), z = vx_setzero_f32();
|
||||||
|
for (; i <= len - vlanes; i += vlanes) {
|
||||||
|
v_float32 x = vx_load(srcptr + i);
|
||||||
|
|
||||||
|
x = v_select(v_le(x, v_threshold), z, x);
|
||||||
|
v_float32 y = v_exp(v_sub(z, x));
|
||||||
|
v_float32 _2y = v_add(y, y),
|
||||||
|
_2ya1 = v_add(_2y, one);
|
||||||
|
x = v_div(v_mul(x, _2ya1), v_add(_2ya1, v_mul(_2y, y)));
|
||||||
|
|
||||||
|
vx_store(dstptr + i, x);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
// In case SIMD is not available or len < vlanes
|
||||||
|
for (; i < len; i++) {
|
||||||
|
dstptr[i] = calculate(srcptr[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef HAVE_CUDA
|
#ifdef HAVE_CUDA
|
||||||
@ -1270,10 +1337,18 @@ const char* const SigmoidFunctor::BaseDefaultFunctor<SigmoidFunctor>::ocl_kernel
|
|||||||
|
|
||||||
struct ELUFunctor : public BaseDefaultFunctor<ELUFunctor>
|
struct ELUFunctor : public BaseDefaultFunctor<ELUFunctor>
|
||||||
{
|
{
|
||||||
typedef ELULayer Layer;
|
using Layer = ELULayer;
|
||||||
float alpha;
|
|
||||||
|
|
||||||
explicit ELUFunctor(float alpha_ = 1.f) : alpha(alpha_) {}
|
float alpha;
|
||||||
|
int vlanes;
|
||||||
|
|
||||||
|
explicit ELUFunctor(float alpha_ = 1.f) : alpha(alpha_) {
|
||||||
|
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||||
|
vlanes = VTraits<v_float32>::vlanes();
|
||||||
|
#else
|
||||||
|
vlanes = 1;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
bool supportBackend(int backendId, int)
|
bool supportBackend(int backendId, int)
|
||||||
{
|
{
|
||||||
@ -1292,6 +1367,28 @@ struct ELUFunctor : public BaseDefaultFunctor<ELUFunctor>
|
|||||||
return x >= 0.f ? x : alpha * (exp(x) - 1.f);
|
return x >= 0.f ? x : alpha * (exp(x) - 1.f);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void apply(const float* srcptr, float* dstptr, int stripeStart, int len, size_t planeSize, int cn0, int cn1) const {
|
||||||
|
CV_UNUSED(stripeStart);
|
||||||
|
for (int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize) {
|
||||||
|
int i = 0;
|
||||||
|
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||||
|
v_float32 z = vx_setzero_f32(), v_alpha = vx_setall_f32(alpha), one = vx_setall_f32(1.0f);
|
||||||
|
for (; i <= len - vlanes; i += vlanes) {
|
||||||
|
v_float32 x = vx_load(srcptr + i);
|
||||||
|
|
||||||
|
v_float32 t = v_mul(v_alpha, v_sub(v_exp(x), one));
|
||||||
|
x = v_select(v_ge(x, z), x, t);
|
||||||
|
|
||||||
|
vx_store(dstptr + i, x);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
// In case SIMD is not available or len < vlanes
|
||||||
|
for (; i < len; i++) {
|
||||||
|
dstptr[i] = calculate(srcptr[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
inline void setKernelParams(ocl::Kernel& kernel) const
|
inline void setKernelParams(ocl::Kernel& kernel) const
|
||||||
{
|
{
|
||||||
kernel.set(3, alpha);
|
kernel.set(3, alpha);
|
||||||
@ -1991,7 +2088,16 @@ const char* const BaseDefaultFunctor<ErfFunctor>::ocl_kernel_name = "ErfForward"
|
|||||||
|
|
||||||
struct HardSwishFunctor : public BaseDefaultFunctor<HardSwishFunctor>
|
struct HardSwishFunctor : public BaseDefaultFunctor<HardSwishFunctor>
|
||||||
{
|
{
|
||||||
typedef HardSwishLayer Layer;
|
using Layer = HardSwishLayer;
|
||||||
|
int vlanes;
|
||||||
|
|
||||||
|
explicit HardSwishFunctor() {
|
||||||
|
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||||
|
vlanes = VTraits<v_float32>::vlanes();
|
||||||
|
#else
|
||||||
|
vlanes = 1;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
bool supportBackend(int backendId, int)
|
bool supportBackend(int backendId, int)
|
||||||
{
|
{
|
||||||
@ -2002,7 +2108,32 @@ struct HardSwishFunctor : public BaseDefaultFunctor<HardSwishFunctor>
|
|||||||
|
|
||||||
inline float calculate(float x) const
|
inline float calculate(float x) const
|
||||||
{
|
{
|
||||||
return x * max(0.f, min(1.f, x / 6.f + 0.5f));
|
return x * std::max(0.f, std::min(1.f, x / 6.f + 0.5f));
|
||||||
|
}
|
||||||
|
|
||||||
|
void apply(const float* srcptr, float* dstptr, int stripeStart, int len, size_t planeSize, int cn0, int cn1) const {
|
||||||
|
CV_UNUSED(stripeStart);
|
||||||
|
for (int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize) {
|
||||||
|
int i = 0;
|
||||||
|
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||||
|
v_float32 zero = vx_setzero_f32(), one = vx_setall_f32(1.0f),
|
||||||
|
half = vx_setall_f32(0.5f), sixth = vx_setall_f32(1 / 6.0f);
|
||||||
|
for (; i <= len - vlanes; i += vlanes) {
|
||||||
|
v_float32 x = vx_load(srcptr + i);
|
||||||
|
|
||||||
|
v_float32 t = v_add(v_mul(x, sixth), half);
|
||||||
|
t = v_min(one, t);
|
||||||
|
t = v_max(zero, t);
|
||||||
|
t = v_mul(x, t);
|
||||||
|
|
||||||
|
vx_store(dstptr + i, t);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
// In case SIMD is not available or len < vlanes (also covers the tail left over by the SIMD loop)
|
||||||
|
for (; i < len; i++) {
|
||||||
|
dstptr[i] = calculate(srcptr[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef HAVE_CUDA
|
#ifdef HAVE_CUDA
|
||||||
@ -2176,11 +2307,18 @@ const char* const BaseDefaultFunctor<TanFunctor>::ocl_kernel_name = "TanForward"
|
|||||||
|
|
||||||
struct CeluFunctor : public BaseDefaultFunctor<CeluFunctor>
|
struct CeluFunctor : public BaseDefaultFunctor<CeluFunctor>
|
||||||
{
|
{
|
||||||
typedef CeluLayer Layer;
|
using Layer = CeluLayer;
|
||||||
|
|
||||||
float alpha;
|
float alpha;
|
||||||
|
int vlanes;
|
||||||
|
|
||||||
explicit CeluFunctor(float alpha_ = 1.f) : alpha(alpha_) {}
|
explicit CeluFunctor(float alpha_ = 1.f) : alpha(alpha_) {
|
||||||
|
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||||
|
vlanes = VTraits<v_float32>::vlanes();
|
||||||
|
#else
|
||||||
|
vlanes = 1;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
bool supportBackend(int backendId, int)
|
bool supportBackend(int backendId, int)
|
||||||
{
|
{
|
||||||
@ -2189,7 +2327,30 @@ struct CeluFunctor : public BaseDefaultFunctor<CeluFunctor>
|
|||||||
|
|
||||||
inline float calculate(float x) const
|
inline float calculate(float x) const
|
||||||
{
|
{
|
||||||
return max(0.f, x) + min(0.f, alpha * expm1(x / alpha));
|
return std::max(0.f, x) + std::min(0.f, alpha * expm1(x / alpha));
|
||||||
|
}
|
||||||
|
|
||||||
|
void apply(const float* srcptr, float* dstptr, int stripeStart, int len, size_t planeSize, int cn0, int cn1) const {
|
||||||
|
CV_UNUSED(stripeStart);
|
||||||
|
for (int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize) {
|
||||||
|
int i = 0;
|
||||||
|
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||||
|
v_float32 zero = vx_setzero_f32(), v_alpha = vx_setall_f32(alpha),
|
||||||
|
one = vx_setall_f32(1.0f), v_ralpha = vx_setall_f32(1.0f / alpha);
|
||||||
|
for (; i <= len - vlanes; i += vlanes) {
|
||||||
|
v_float32 x = vx_load(srcptr + i);
|
||||||
|
|
||||||
|
v_float32 t = v_min(zero, v_mul(v_alpha, v_sub(v_exp(v_mul(x, v_ralpha)), one)));
|
||||||
|
t = v_add(v_max(zero, x), t);
|
||||||
|
|
||||||
|
vx_store(dstptr + i, t);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
// In case SIMD is not available or len < vlanes
|
||||||
|
for (; i < len; i++) {
|
||||||
|
dstptr[i] = calculate(srcptr[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void setKernelParams(ocl::Kernel& kernel) const
|
inline void setKernelParams(ocl::Kernel& kernel) const
|
||||||
@ -2250,13 +2411,21 @@ const char* const BaseDefaultFunctor<HardSigmoidFunctor>::ocl_kernel_name = "Har
|
|||||||
|
|
||||||
struct SeluFunctor : public BaseDefaultFunctor<SeluFunctor>
|
struct SeluFunctor : public BaseDefaultFunctor<SeluFunctor>
|
||||||
{
|
{
|
||||||
typedef SeluLayer Layer;
|
using Layer = SeluLayer;
|
||||||
|
|
||||||
float alpha;
|
float alpha;
|
||||||
float gamma;
|
float gamma;
|
||||||
|
int vlanes;
|
||||||
|
|
||||||
explicit SeluFunctor(float alpha_ = 1.67326319217681884765625f,
|
explicit SeluFunctor(float alpha_ = 1.67326319217681884765625f,
|
||||||
float gamma_ = 1.05070102214813232421875f) : alpha(alpha_), gamma(gamma_) {}
|
float gamma_ = 1.05070102214813232421875f)
|
||||||
|
: alpha(alpha_), gamma(gamma_) {
|
||||||
|
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||||
|
vlanes = VTraits<v_float32>::vlanes();
|
||||||
|
#else
|
||||||
|
vlanes = 1;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
bool supportBackend(int backendId, int)
|
bool supportBackend(int backendId, int)
|
||||||
{
|
{
|
||||||
@ -2268,6 +2437,30 @@ struct SeluFunctor : public BaseDefaultFunctor<SeluFunctor>
|
|||||||
return gamma * (x > 0.f ? x : alpha * expm1(x));
|
return gamma * (x > 0.f ? x : alpha * expm1(x));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void apply(const float* srcptr, float* dstptr, int stripeStart, int len, size_t planeSize, int cn0, int cn1) const {
|
||||||
|
CV_UNUSED(stripeStart);
|
||||||
|
for (int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize) {
|
||||||
|
int i = 0;
|
||||||
|
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||||
|
v_float32 z = vx_setzero_f32(), one = vx_setall_f32(1.0f),
|
||||||
|
v_alpha = vx_setall_f32(alpha), v_gamma = vx_setall_f32(gamma);
|
||||||
|
for (; i <= len - vlanes; i += vlanes) {
|
||||||
|
v_float32 x = vx_load(srcptr + i);
|
||||||
|
|
||||||
|
v_float32 t = v_mul(v_alpha, v_sub(v_exp(x), one));
|
||||||
|
x = v_select(v_le(x, z), t, x);
|
||||||
|
x = v_mul(v_gamma, x);
|
||||||
|
|
||||||
|
vx_store(dstptr + i, x);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
// In case SIMD is not available or len < vlanes (also covers the tail left over by the SIMD loop)
|
||||||
|
for (; i < len; i++) {
|
||||||
|
dstptr[i] = calculate(srcptr[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
inline void setKernelParams(ocl::Kernel& kernel) const
|
inline void setKernelParams(ocl::Kernel& kernel) const
|
||||||
{
|
{
|
||||||
kernel.set(3, alpha);
|
kernel.set(3, alpha);
|
||||||
|
@ -250,7 +250,10 @@ static const TestCase testConformanceConfig[] = {
|
|||||||
{"test_einsum_transpose", 1, 1},
|
{"test_einsum_transpose", 1, 1},
|
||||||
{"test_elu", 1, 1},
|
{"test_elu", 1, 1},
|
||||||
{"test_elu_default", 1, 1},
|
{"test_elu_default", 1, 1},
|
||||||
|
{"test_elu_default_expanded_ver18", 1, 1},
|
||||||
{"test_elu_example", 1, 1},
|
{"test_elu_example", 1, 1},
|
||||||
|
{"test_elu_example_expanded_ver18", 1, 1},
|
||||||
|
{"test_elu_expanded_ver18", 1, 1},
|
||||||
{"test_equal", 2, 1},
|
{"test_equal", 2, 1},
|
||||||
{"test_equal_bcast", 2, 1},
|
{"test_equal_bcast", 2, 1},
|
||||||
{"test_erf", 1, 1},
|
{"test_erf", 1, 1},
|
||||||
@ -454,6 +457,8 @@ static const TestCase testConformanceConfig[] = {
|
|||||||
{"test_min_uint32", 2, 1},
|
{"test_min_uint32", 2, 1},
|
||||||
{"test_min_uint64", 2, 1},
|
{"test_min_uint64", 2, 1},
|
||||||
{"test_min_uint8", 2, 1},
|
{"test_min_uint8", 2, 1},
|
||||||
|
{"test_mish", 1, 1},
|
||||||
|
{"test_mish_expanded", 1, 1},
|
||||||
{"test_mod_broadcast", 2, 1},
|
{"test_mod_broadcast", 2, 1},
|
||||||
{"test_mod_int64_fmod", 2, 1},
|
{"test_mod_int64_fmod", 2, 1},
|
||||||
{"test_mod_mixed_sign_float16", 2, 1},
|
{"test_mod_mixed_sign_float16", 2, 1},
|
||||||
@ -775,7 +780,10 @@ static const TestCase testConformanceConfig[] = {
|
|||||||
{"test_sce_sum_log_prob_expanded", 2, 2},
|
{"test_sce_sum_log_prob_expanded", 2, 2},
|
||||||
{"test_selu", 1, 1},
|
{"test_selu", 1, 1},
|
||||||
{"test_selu_default", 1, 1},
|
{"test_selu_default", 1, 1},
|
||||||
|
{"test_selu_default_expanded_ver18", 1, 1},
|
||||||
{"test_selu_example", 1, 1},
|
{"test_selu_example", 1, 1},
|
||||||
|
{"test_selu_example_expanded_ver18", 1, 1},
|
||||||
|
{"test_selu_expanded_ver18", 1, 1},
|
||||||
{"test_sequence_insert_at_back", 2, 1},
|
{"test_sequence_insert_at_back", 2, 1},
|
||||||
{"test_sequence_insert_at_front", 3, 1},
|
{"test_sequence_insert_at_front", 3, 1},
|
||||||
{"test_shape", 1, 1},
|
{"test_shape", 1, 1},
|
||||||
|
@ -624,8 +624,14 @@ CASE(test_elu)
|
|||||||
// no filter
|
// no filter
|
||||||
CASE(test_elu_default)
|
CASE(test_elu_default)
|
||||||
// no filter
|
// no filter
|
||||||
|
CASE(test_elu_default_expanded_ver18)
|
||||||
|
// no filter
|
||||||
CASE(test_elu_example)
|
CASE(test_elu_example)
|
||||||
// no filter
|
// no filter
|
||||||
|
CASE(test_elu_example_expanded_ver18)
|
||||||
|
// no filter
|
||||||
|
CASE(test_elu_expanded_ver18)
|
||||||
|
// no filter
|
||||||
CASE(test_equal)
|
CASE(test_equal)
|
||||||
// no filter
|
// no filter
|
||||||
CASE(test_equal_bcast)
|
CASE(test_equal_bcast)
|
||||||
@ -1098,6 +1104,10 @@ CASE(test_min_uint64)
|
|||||||
// no filter
|
// no filter
|
||||||
CASE(test_min_uint8)
|
CASE(test_min_uint8)
|
||||||
// no filter
|
// no filter
|
||||||
|
CASE(test_mish)
|
||||||
|
// no filter
|
||||||
|
CASE(test_mish_expanded)
|
||||||
|
// no filter
|
||||||
CASE(test_mod_broadcast)
|
CASE(test_mod_broadcast)
|
||||||
// no filter
|
// no filter
|
||||||
CASE(test_mod_int64_fmod)
|
CASE(test_mod_int64_fmod)
|
||||||
@ -1851,8 +1861,14 @@ CASE(test_selu)
|
|||||||
// no filter
|
// no filter
|
||||||
CASE(test_selu_default)
|
CASE(test_selu_default)
|
||||||
// no filter
|
// no filter
|
||||||
|
CASE(test_selu_default_expanded_ver18)
|
||||||
|
// no filter
|
||||||
CASE(test_selu_example)
|
CASE(test_selu_example)
|
||||||
// no filter
|
// no filter
|
||||||
|
CASE(test_selu_example_expanded_ver18)
|
||||||
|
// no filter
|
||||||
|
CASE(test_selu_expanded_ver18)
|
||||||
|
// no filter
|
||||||
CASE(test_sequence_insert_at_back)
|
CASE(test_sequence_insert_at_back)
|
||||||
// no filter
|
// no filter
|
||||||
CASE(test_sequence_insert_at_front)
|
CASE(test_sequence_insert_at_front)
|
||||||
|
@ -103,6 +103,9 @@
|
|||||||
"test_dynamicquantizelinear_min_adjusted_expanded",
|
"test_dynamicquantizelinear_min_adjusted_expanded",
|
||||||
"test_edge_pad",
|
"test_edge_pad",
|
||||||
"test_einsum_inner_prod",
|
"test_einsum_inner_prod",
|
||||||
|
"test_elu_default_expanded_ver18",
|
||||||
|
"test_elu_example_expanded_ver18",
|
||||||
|
"test_elu_expanded_ver18",
|
||||||
"test_equal",
|
"test_equal",
|
||||||
"test_equal_bcast",
|
"test_equal_bcast",
|
||||||
"test_expand_dim_changed",
|
"test_expand_dim_changed",
|
||||||
@ -412,6 +415,9 @@
|
|||||||
"test_sce_sum_expanded",
|
"test_sce_sum_expanded",
|
||||||
"test_sce_sum_log_prob",
|
"test_sce_sum_log_prob",
|
||||||
"test_sce_sum_log_prob_expanded",
|
"test_sce_sum_log_prob_expanded",
|
||||||
|
"test_selu_default_expanded_ver18",
|
||||||
|
"test_selu_example_expanded_ver18",
|
||||||
|
"test_selu_expanded_ver18",
|
||||||
"test_sequence_insert_at_back",
|
"test_sequence_insert_at_back",
|
||||||
"test_sequence_insert_at_front",
|
"test_sequence_insert_at_front",
|
||||||
"test_shape",
|
"test_shape",
|
||||||
|
Loading…
Reference in New Issue
Block a user