opencv/modules/dnn/test/test_onnx_conformance.cpp

1347 lines
53 KiB
C++
Raw Normal View History

// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "test_precomp.hpp"
#include <set>
#include <string>
#include "npy_blob.hpp"
#include <opencv2/dnn/shape_utils.hpp>
#if defined(_MSC_VER) // workaround for 32-bit MSVC compiler
#pragma optimize("", off)
#endif
// Tag-name string constants defined locally by this test file.
// NOTE(review): judging by the names, these presumably label conformance cases
// by the stage at which they are expected to fail (parser, net setup, forward),
// by layer fallback, or by a skipped accuracy check. The code that consumes
// them is outside this view -- confirm there before relying on this.
#define CV_TEST_TAG_DNN_ERROR_PARSER "dnn_error_parser"
#define CV_TEST_TAG_DNN_ERROR_NET_SETUP "dnn_error_net_setup"
#define CV_TEST_TAG_DNN_ERROR_FORWARD "dnn_error_forward"
#define CV_TEST_TAG_DNN_LAYER_FALLBACK "dnn_layer_fallback"
#define CV_TEST_TAG_DNN_NO_ACCURACY_CHECK "dnn_no_accuracy_check"
namespace opencv_test {
// One row of the conformance-test table defined right after this struct.
//
// Kept as a plain aggregate (no constructors, no default member initializers)
// so that rows can be written as brace lists: {"test_name", <inputs>, <outputs>}.
struct TestCase
{
    // Test case identifier, e.g. "test_abs". Stored as a non-owning pointer;
    // the table rows initialise it from string literals.
    const char* name;

    // NOTE(review): presumably the number of input tensors the case provides
    // (the table contains 0 for "test_constant") -- confirm against the code
    // that reads this field.
    uint32_t inputs;

    // NOTE(review): presumably the number of reference output tensors the case
    // provides -- confirm against the code that reads this field.
    uint32_t outputs;
};
static const TestCase testConformanceConfig[] = {
{"test_abs", 1, 1},
{"test_acos", 1, 1},
{"test_acos_example", 1, 1},
{"test_acosh", 1, 1},
{"test_acosh_example", 1, 1},
{"test_adagrad", 5, 2},
{"test_adagrad_multiple", 8, 4},
{"test_adam", 6, 3},
{"test_adam_multiple", 10, 6},
{"test_add", 2, 1},
{"test_add_bcast", 2, 1},
{"test_add_uint8", 2, 1},
{"test_and2d", 2, 1},
{"test_and3d", 2, 1},
{"test_and4d", 2, 1},
{"test_and_bcast3v1d", 2, 1},
{"test_and_bcast3v2d", 2, 1},
{"test_and_bcast4v2d", 2, 1},
{"test_and_bcast4v3d", 2, 1},
{"test_and_bcast4v4d", 2, 1},
{"test_argmax_default_axis_example", 1, 1},
{"test_argmax_default_axis_example_select_last_index", 1, 1},
{"test_argmax_default_axis_random", 1, 1},
{"test_argmax_default_axis_random_select_last_index", 1, 1},
{"test_argmax_keepdims_example", 1, 1},
{"test_argmax_keepdims_example_select_last_index", 1, 1},
{"test_argmax_keepdims_random", 1, 1},
{"test_argmax_keepdims_random_select_last_index", 1, 1},
{"test_argmax_negative_axis_keepdims_example", 1, 1},
{"test_argmax_negative_axis_keepdims_example_select_last_index", 1, 1},
{"test_argmax_negative_axis_keepdims_random", 1, 1},
{"test_argmax_negative_axis_keepdims_random_select_last_index", 1, 1},
{"test_argmax_no_keepdims_example", 1, 1},
{"test_argmax_no_keepdims_example_select_last_index", 1, 1},
{"test_argmax_no_keepdims_random", 1, 1},
{"test_argmax_no_keepdims_random_select_last_index", 1, 1},
{"test_argmin_default_axis_example", 1, 1},
{"test_argmin_default_axis_example_select_last_index", 1, 1},
{"test_argmin_default_axis_random", 1, 1},
{"test_argmin_default_axis_random_select_last_index", 1, 1},
{"test_argmin_keepdims_example", 1, 1},
{"test_argmin_keepdims_example_select_last_index", 1, 1},
{"test_argmin_keepdims_random", 1, 1},
{"test_argmin_keepdims_random_select_last_index", 1, 1},
{"test_argmin_negative_axis_keepdims_example", 1, 1},
{"test_argmin_negative_axis_keepdims_example_select_last_index", 1, 1},
{"test_argmin_negative_axis_keepdims_random", 1, 1},
{"test_argmin_negative_axis_keepdims_random_select_last_index", 1, 1},
{"test_argmin_no_keepdims_example", 1, 1},
{"test_argmin_no_keepdims_example_select_last_index", 1, 1},
{"test_argmin_no_keepdims_random", 1, 1},
{"test_argmin_no_keepdims_random_select_last_index", 1, 1},
{"test_asin", 1, 1},
{"test_asin_example", 1, 1},
{"test_asinh", 1, 1},
{"test_asinh_example", 1, 1},
{"test_atan", 1, 1},
{"test_atan_example", 1, 1},
{"test_atanh", 1, 1},
{"test_atanh_example", 1, 1},
{"test_averagepool_1d_default", 1, 1},
{"test_averagepool_2d_ceil", 1, 1},
{"test_averagepool_2d_default", 1, 1},
{"test_averagepool_2d_pads", 1, 1},
{"test_averagepool_2d_pads_count_include_pad", 1, 1},
{"test_averagepool_2d_precomputed_pads", 1, 1},
{"test_averagepool_2d_precomputed_pads_count_include_pad", 1, 1},
{"test_averagepool_2d_precomputed_same_upper", 1, 1},
{"test_averagepool_2d_precomputed_strides", 1, 1},
{"test_averagepool_2d_same_lower", 1, 1},
{"test_averagepool_2d_same_upper", 1, 1},
{"test_averagepool_2d_strides", 1, 1},
{"test_averagepool_3d_default", 1, 1},
{"test_basic_conv_with_padding", 2, 1},
{"test_basic_conv_without_padding", 2, 1},
{"test_basic_convinteger", 3, 1},
{"test_batchnorm_epsilon", 5, 1},
{"test_batchnorm_epsilon_training_mode", 5, 3},
{"test_batchnorm_example", 5, 1},
{"test_batchnorm_example_training_mode", 5, 3},
{"test_bernoulli", 1, 1},
{"test_bernoulli_double", 1, 1},
{"test_bernoulli_double_expanded", 1, 1},
{"test_bernoulli_expanded", 1, 1},
{"test_bernoulli_seed", 1, 1},
{"test_bernoulli_seed_expanded", 1, 1},
{"test_bitshift_left_uint16", 2, 1},
{"test_bitshift_left_uint32", 2, 1},
{"test_bitshift_left_uint64", 2, 1},
{"test_bitshift_left_uint8", 2, 1},
{"test_bitshift_right_uint16", 2, 1},
{"test_bitshift_right_uint32", 2, 1},
{"test_bitshift_right_uint64", 2, 1},
{"test_bitshift_right_uint8", 2, 1},
{"test_cast_BFLOAT16_to_FLOAT", 1, 1},
{"test_cast_DOUBLE_to_FLOAT", 1, 1},
{"test_cast_DOUBLE_to_FLOAT16", 1, 1},
{"test_cast_FLOAT16_to_DOUBLE", 1, 1},
{"test_cast_FLOAT16_to_FLOAT", 1, 1},
{"test_cast_FLOAT_to_BFLOAT16", 1, 1},
{"test_cast_FLOAT_to_DOUBLE", 1, 1},
{"test_cast_FLOAT_to_FLOAT16", 1, 1},
{"test_cast_FLOAT_to_STRING", 1, 1},
{"test_cast_STRING_to_FLOAT", 1, 1},
{"test_castlike_BFLOAT16_to_FLOAT", 2, 1},
{"test_castlike_BFLOAT16_to_FLOAT_expanded", 2, 1},
{"test_castlike_DOUBLE_to_FLOAT", 2, 1},
{"test_castlike_DOUBLE_to_FLOAT16", 2, 1},
{"test_castlike_DOUBLE_to_FLOAT16_expanded", 2, 1},
{"test_castlike_DOUBLE_to_FLOAT_expanded", 2, 1},
{"test_castlike_FLOAT16_to_DOUBLE", 2, 1},
{"test_castlike_FLOAT16_to_DOUBLE_expanded", 2, 1},
{"test_castlike_FLOAT16_to_FLOAT", 2, 1},
{"test_castlike_FLOAT16_to_FLOAT_expanded", 2, 1},
{"test_castlike_FLOAT_to_BFLOAT16", 2, 1},
{"test_castlike_FLOAT_to_BFLOAT16_expanded", 2, 1},
{"test_castlike_FLOAT_to_DOUBLE", 2, 1},
{"test_castlike_FLOAT_to_DOUBLE_expanded", 2, 1},
{"test_castlike_FLOAT_to_FLOAT16", 2, 1},
{"test_castlike_FLOAT_to_FLOAT16_expanded", 2, 1},
{"test_castlike_FLOAT_to_STRING", 2, 1},
{"test_castlike_FLOAT_to_STRING_expanded", 2, 1},
{"test_castlike_STRING_to_FLOAT", 2, 1},
{"test_castlike_STRING_to_FLOAT_expanded", 2, 1},
{"test_ceil", 1, 1},
{"test_ceil_example", 1, 1},
{"test_celu", 1, 1},
{"test_celu_expanded", 1, 1},
{"test_clip", 3, 1},
{"test_clip_default_inbounds", 1, 1},
{"test_clip_default_int8_inbounds", 1, 1},
{"test_clip_default_int8_max", 2, 1},
{"test_clip_default_int8_min", 2, 1},
{"test_clip_default_max", 2, 1},
{"test_clip_default_min", 2, 1},
{"test_clip_example", 3, 1},
{"test_clip_inbounds", 3, 1},
{"test_clip_outbounds", 3, 1},
{"test_clip_splitbounds", 3, 1},
{"test_compress_0", 2, 1},
{"test_compress_1", 2, 1},
{"test_compress_default_axis", 2, 1},
{"test_compress_negative_axis", 2, 1},
{"test_concat_1d_axis_0", 2, 1},
{"test_concat_1d_axis_negative_1", 2, 1},
{"test_concat_2d_axis_0", 2, 1},
{"test_concat_2d_axis_1", 2, 1},
{"test_concat_2d_axis_negative_1", 2, 1},
{"test_concat_2d_axis_negative_2", 2, 1},
{"test_concat_3d_axis_0", 2, 1},
{"test_concat_3d_axis_1", 2, 1},
{"test_concat_3d_axis_2", 2, 1},
{"test_concat_3d_axis_negative_1", 2, 1},
{"test_concat_3d_axis_negative_2", 2, 1},
{"test_concat_3d_axis_negative_3", 2, 1},
{"test_constant", 0, 1},
{"test_constant_pad", 3, 1},
{"test_constantofshape_float_ones", 1, 1},
{"test_constantofshape_int_shape_zero", 1, 1},
{"test_constantofshape_int_zeros", 1, 1},
{"test_conv_with_autopad_same", 2, 1},
{"test_conv_with_strides_and_asymmetric_padding", 2, 1},
{"test_conv_with_strides_no_padding", 2, 1},
{"test_conv_with_strides_padding", 2, 1},
{"test_convinteger_with_padding", 3, 1},
{"test_convinteger_without_padding", 3, 1},
{"test_convtranspose", 2, 1},
{"test_convtranspose_1d", 2, 1},
{"test_convtranspose_3d", 2, 1},
{"test_convtranspose_autopad_same", 2, 1},
{"test_convtranspose_dilations", 2, 1},
{"test_convtranspose_kernel_shape", 2, 1},
{"test_convtranspose_output_shape", 2, 1},
{"test_convtranspose_pad", 2, 1},
{"test_convtranspose_pads", 2, 1},
{"test_convtranspose_with_kernel", 2, 1},
{"test_cos", 1, 1},
{"test_cos_example", 1, 1},
{"test_cosh", 1, 1},
{"test_cosh_example", 1, 1},
{"test_cumsum_1d", 2, 1},
{"test_cumsum_1d_exclusive", 2, 1},
{"test_cumsum_1d_reverse", 2, 1},
{"test_cumsum_1d_reverse_exclusive", 2, 1},
{"test_cumsum_2d_axis_0", 2, 1},
{"test_cumsum_2d_axis_1", 2, 1},
{"test_cumsum_2d_negative_axis", 2, 1},
{"test_depthtospace_crd_mode", 1, 1},
{"test_depthtospace_crd_mode_example", 1, 1},
{"test_depthtospace_dcr_mode", 1, 1},
{"test_depthtospace_example", 1, 1},
{"test_dequantizelinear", 3, 1},
{"test_dequantizelinear_axis", 3, 1},
{"test_det_2d", 1, 1},
{"test_det_nd", 1, 1},
{"test_div", 2, 1},
{"test_div_bcast", 2, 1},
{"test_div_example", 2, 1},
{"test_div_uint8", 2, 1},
{"test_dropout_default", 1, 1},
{"test_dropout_default_mask", 1, 2},
{"test_dropout_default_mask_ratio", 2, 2},
{"test_dropout_default_old", 1, 1},
{"test_dropout_default_ratio", 2, 1},
{"test_dropout_random_old", 1, 1},
{"test_dynamicquantizelinear", 1, 3},
{"test_dynamicquantizelinear_expanded", 1, 3},
{"test_dynamicquantizelinear_max_adjusted", 1, 3},
{"test_dynamicquantizelinear_max_adjusted_expanded", 1, 3},
{"test_dynamicquantizelinear_min_adjusted", 1, 3},
{"test_dynamicquantizelinear_min_adjusted_expanded", 1, 3},
{"test_edge_pad", 2, 1},
{"test_einsum_batch_diagonal", 1, 1},
{"test_einsum_batch_matmul", 2, 1},
{"test_einsum_inner_prod", 2, 1},
{"test_einsum_sum", 1, 1},
{"test_einsum_transpose", 1, 1},
{"test_elu", 1, 1},
{"test_elu_default", 1, 1},
Merge pull request #25881 from fengyuentau:dnn/cpu/optimize_activations_with_v_exp dnn: optimize activations with v_exp #25881 Merge with https://github.com/opencv/opencv_extra/pull/1191. This PR optimizes the following activations: - [x] Swish - [x] Mish - [x] Elu - [x] Celu - [x] Selu - [x] HardSwish ### Performance (Updated on 2024-07-18) #### AmLogic A311D2 (ARM Cortex A73 + A53) ``` Geometric mean (ms) Name of Test activations activations.patch activations.patch vs activations (x-factor) Celu::Layer_Elementwise::OCV/CPU 115.859 27.930 4.15 Elu::Layer_Elementwise::OCV/CPU 27.846 27.003 1.03 Gelu::Layer_Elementwise::OCV/CPU 0.657 0.602 1.09 HardSwish::Layer_Elementwise::OCV/CPU 31.885 6.781 4.70 Mish::Layer_Elementwise::OCV/CPU 35.729 32.089 1.11 Selu::Layer_Elementwise::OCV/CPU 61.955 27.850 2.22 Swish::Layer_Elementwise::OCV/CPU 30.819 26.688 1.15 ``` #### Apple M1 ``` Geometric mean (ms) Name of Test activations activations.patch activations.patch vs activations (x-factor) Celu::Layer_Elementwise::OCV/CPU 16.184 2.118 7.64 Celu::Layer_Elementwise::OCV/CPU_FP16 16.280 2.123 7.67 Elu::Layer_Elementwise::OCV/CPU 9.123 1.878 4.86 Elu::Layer_Elementwise::OCV/CPU_FP16 9.085 1.897 4.79 Gelu::Layer_Elementwise::OCV/CPU 0.089 0.081 1.11 Gelu::Layer_Elementwise::OCV/CPU_FP16 0.086 0.074 1.17 HardSwish::Layer_Elementwise::OCV/CPU 1.560 1.555 1.00 HardSwish::Layer_Elementwise::OCV/CPU_FP16 1.536 1.523 1.01 Mish::Layer_Elementwise::OCV/CPU 6.077 2.476 2.45 Mish::Layer_Elementwise::OCV/CPU_FP16 5.990 2.496 2.40 Selu::Layer_Elementwise::OCV/CPU 11.351 1.976 5.74 Selu::Layer_Elementwise::OCV/CPU_FP16 11.533 1.985 5.81 Swish::Layer_Elementwise::OCV/CPU 4.687 1.890 2.48 Swish::Layer_Elementwise::OCV/CPU_FP16 4.715 1.873 2.52 ``` #### Intel i7-12700K ``` Geometric mean (ms) Name of Test activations activations.patch activations.patch vs activations (x-factor) Celu::Layer_Elementwise::OCV/CPU 17.106 3.560 4.81 Elu::Layer_Elementwise::OCV/CPU 5.064 3.478 1.46 
Gelu::Layer_Elementwise::OCV/CPU 0.036 0.035 1.04 HardSwish::Layer_Elementwise::OCV/CPU 2.914 2.893 1.01 Mish::Layer_Elementwise::OCV/CPU 3.820 3.529 1.08 Selu::Layer_Elementwise::OCV/CPU 10.799 3.593 3.01 Swish::Layer_Elementwise::OCV/CPU 3.651 3.473 1.05 ``` ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - [x] There is a reference to the original bug report and related work - [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [x] The feature is well documented and sample code can be built with the project CMake
2024-07-19 21:03:19 +08:00
{"test_elu_default_expanded_ver18", 1, 1},
{"test_elu_example", 1, 1},
Merge pull request #25881 from fengyuentau:dnn/cpu/optimize_activations_with_v_exp dnn: optimize activations with v_exp #25881 Merge with https://github.com/opencv/opencv_extra/pull/1191. This PR optimizes the following activations: - [x] Swish - [x] Mish - [x] Elu - [x] Celu - [x] Selu - [x] HardSwish ### Performance (Updated on 2024-07-18) #### AmLogic A311D2 (ARM Cortex A73 + A53) ``` Geometric mean (ms) Name of Test activations activations.patch activations.patch vs activations (x-factor) Celu::Layer_Elementwise::OCV/CPU 115.859 27.930 4.15 Elu::Layer_Elementwise::OCV/CPU 27.846 27.003 1.03 Gelu::Layer_Elementwise::OCV/CPU 0.657 0.602 1.09 HardSwish::Layer_Elementwise::OCV/CPU 31.885 6.781 4.70 Mish::Layer_Elementwise::OCV/CPU 35.729 32.089 1.11 Selu::Layer_Elementwise::OCV/CPU 61.955 27.850 2.22 Swish::Layer_Elementwise::OCV/CPU 30.819 26.688 1.15 ``` #### Apple M1 ``` Geometric mean (ms) Name of Test activations activations.patch activations.patch vs activations (x-factor) Celu::Layer_Elementwise::OCV/CPU 16.184 2.118 7.64 Celu::Layer_Elementwise::OCV/CPU_FP16 16.280 2.123 7.67 Elu::Layer_Elementwise::OCV/CPU 9.123 1.878 4.86 Elu::Layer_Elementwise::OCV/CPU_FP16 9.085 1.897 4.79 Gelu::Layer_Elementwise::OCV/CPU 0.089 0.081 1.11 Gelu::Layer_Elementwise::OCV/CPU_FP16 0.086 0.074 1.17 HardSwish::Layer_Elementwise::OCV/CPU 1.560 1.555 1.00 HardSwish::Layer_Elementwise::OCV/CPU_FP16 1.536 1.523 1.01 Mish::Layer_Elementwise::OCV/CPU 6.077 2.476 2.45 Mish::Layer_Elementwise::OCV/CPU_FP16 5.990 2.496 2.40 Selu::Layer_Elementwise::OCV/CPU 11.351 1.976 5.74 Selu::Layer_Elementwise::OCV/CPU_FP16 11.533 1.985 5.81 Swish::Layer_Elementwise::OCV/CPU 4.687 1.890 2.48 Swish::Layer_Elementwise::OCV/CPU_FP16 4.715 1.873 2.52 ``` #### Intel i7-12700K ``` Geometric mean (ms) Name of Test activations activations.patch activations.patch vs activations (x-factor) Celu::Layer_Elementwise::OCV/CPU 17.106 3.560 4.81 Elu::Layer_Elementwise::OCV/CPU 5.064 3.478 1.46 
Gelu::Layer_Elementwise::OCV/CPU 0.036 0.035 1.04 HardSwish::Layer_Elementwise::OCV/CPU 2.914 2.893 1.01 Mish::Layer_Elementwise::OCV/CPU 3.820 3.529 1.08 Selu::Layer_Elementwise::OCV/CPU 10.799 3.593 3.01 Swish::Layer_Elementwise::OCV/CPU 3.651 3.473 1.05 ``` ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - [x] There is a reference to the original bug report and related work - [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [x] The feature is well documented and sample code can be built with the project CMake
2024-07-19 21:03:19 +08:00
{"test_elu_example_expanded_ver18", 1, 1},
{"test_elu_expanded_ver18", 1, 1},
{"test_equal", 2, 1},
{"test_equal_bcast", 2, 1},
{"test_erf", 1, 1},
{"test_exp", 1, 1},
{"test_exp_example", 1, 1},
{"test_expand_dim_changed", 2, 1},
{"test_expand_dim_unchanged", 2, 1},
{"test_eyelike_populate_off_main_diagonal", 1, 1},
{"test_eyelike_with_dtype", 1, 1},
{"test_eyelike_without_dtype", 1, 1},
{"test_flatten_axis0", 1, 1},
{"test_flatten_axis1", 1, 1},
{"test_flatten_axis2", 1, 1},
{"test_flatten_axis3", 1, 1},
{"test_flatten_default_axis", 1, 1},
{"test_flatten_negative_axis1", 1, 1},
{"test_flatten_negative_axis2", 1, 1},
{"test_flatten_negative_axis3", 1, 1},
{"test_flatten_negative_axis4", 1, 1},
{"test_floor", 1, 1},
{"test_floor_example", 1, 1},
{"test_gather_0", 2, 1},
{"test_gather_1", 2, 1},
{"test_gather_2d_indices", 2, 1},
{"test_gather_elements_0", 2, 1},
{"test_gather_elements_1", 2, 1},
{"test_gather_elements_negative_indices", 2, 1},
{"test_gather_negative_indices", 2, 1},
{"test_gathernd_example_float32", 2, 1},
{"test_gathernd_example_int32", 2, 1},
{"test_gathernd_example_int32_batch_dim1", 2, 1},
{"test_gelu_default_1", 1, 1},
{"test_gelu_default_1_expanded", 1, 1},
{"test_gelu_default_2", 1, 1},
{"test_gelu_default_2_expanded", 1, 1},
{"test_gelu_tanh_1", 1, 1},
{"test_gelu_tanh_1_expanded", 1, 1},
{"test_gelu_tanh_2", 1, 1},
{"test_gelu_tanh_2_expanded", 1, 1},
{"test_gemm_all_attributes", 3, 1},
{"test_gemm_alpha", 3, 1},
{"test_gemm_beta", 3, 1},
{"test_gemm_default_matrix_bias", 3, 1},
{"test_gemm_default_no_bias", 2, 1},
{"test_gemm_default_scalar_bias", 3, 1},
{"test_gemm_default_single_elem_vector_bias", 3, 1},
{"test_gemm_default_vector_bias", 3, 1},
{"test_gemm_default_zero_bias", 3, 1},
{"test_gemm_transposeA", 3, 1},
{"test_gemm_transposeB", 3, 1},
{"test_globalaveragepool", 1, 1},
{"test_globalaveragepool_precomputed", 1, 1},
{"test_globalmaxpool", 1, 1},
{"test_globalmaxpool_precomputed", 1, 1},
{"test_greater", 2, 1},
{"test_greater_bcast", 2, 1},
{"test_greater_equal", 2, 1},
{"test_greater_equal_bcast", 2, 1},
{"test_greater_equal_bcast_expanded", 2, 1},
{"test_greater_equal_expanded", 2, 1},
{"test_gridsample", 2, 1},
{"test_gridsample_aligncorners_true", 2, 1},
{"test_gridsample_bicubic", 2, 1},
{"test_gridsample_bilinear", 2, 1},
{"test_gridsample_border_padding", 2, 1},
{"test_gridsample_nearest", 2, 1},
{"test_gridsample_reflection_padding", 2, 1},
{"test_gridsample_zeros_padding", 2, 1},
{"test_group_normalization_epsilon", 3, 1},
{"test_group_normalization_example", 3, 1},
{"test_gru_batchwise", 3, 2},
{"test_gru_defaults", 3, 1},
{"test_gru_seq_length", 4, 1},
{"test_gru_with_initial_bias", 4, 1},
{"test_hardmax_axis_0", 1, 1},
{"test_hardmax_axis_1", 1, 1},
{"test_hardmax_axis_2", 1, 1},
{"test_hardmax_default_axis", 1, 1},
{"test_hardmax_example", 1, 1},
{"test_hardmax_negative_axis", 1, 1},
{"test_hardmax_one_hot", 1, 1},
{"test_hardsigmoid", 1, 1},
{"test_hardsigmoid_default", 1, 1},
{"test_hardsigmoid_example", 1, 1},
{"test_hardswish", 1, 1},
{"test_hardswish_expanded", 1, 1},
{"test_identity", 1, 1},
{"test_identity_opt", 1, 1},
{"test_identity_sequence", 1, 1},
{"test_if", 1, 1},
{"test_if_opt", 1, 1},
{"test_if_seq", 1, 1},
{"test_instancenorm_epsilon", 3, 1},
{"test_instancenorm_example", 3, 1},
{"test_isinf", 1, 1},
{"test_isinf_negative", 1, 1},
{"test_isinf_positive", 1, 1},
{"test_isnan", 1, 1},
Merge pull request #24544 from fengyuentau:layernorm_conformance dnn test: move layer norm tests into conformance tests #24544 Merge with https://github.com/opencv/opencv_extra/pull/1122 ## Motivation Some ONNX operators, such as `LayerNormalization`, `BatchNormalization` and so on, produce outputs for training (mean, stdev). So they have reference outputs of conformance tests for those training outputs as well. However, when it comes to inference, we do not need and produce those outputs for training here in dnn. Hence, output size does not match if we use dnn to infer those conformance models. This has become the barrier if we want to test these operators using their conformance tests. <!-- | Operator | Inference needed | Outputs (required - total) | Optional outputs for training? | | ----------------------- | ----------------------------------- | -------------------------- | ------------------------------ | | BatchNormalization | Yes | 1 - 3 | Yes | | Dropout | Maybe, can be eliminated via fusion | 1 - 2 | Yes | | GRU | Yes | 0 - 2 | No | | LSTM | Yes | 0 - 3 | No | | LayerNormalization | Yes | 1 - 3 | Yes | | MaxPool | Yes | 1 - 2 | Yes | | RNN | Yes | 0 - 2 | No | | SoftmaxCrossEntropyLoss | No | 1 - 2 | -- | --> **I checked all ONNX operators with optional outputs. Turns out there are only `BatchNormalization`, `Dropout`, `LayerNormalization` and `MaxPool` has optional outputs for training. All except `LayerNormalization` have models set for training mode and eval mode. Blame ONNX for that.** ## Solution In this pull request, we remove graph outputs if the graph looks like the following: ``` [X] [Scale] [Bias] [X] [Scale] [Bias] \ | / this patch \ | / LayerNormalization -----------> LayerNormalization / | \ | [Y] [Mean] [Stdev] [Y] ``` We can update conformance tests and turn on some cases as well if extending to more layers. Notes: 1. 
This workaround does not solve expanded function operators if they are fused into a single operator, such as `$onnx/onnx/backend/test/data/node/test_layer_normalization_2d_axis1_expanded`, but they can be run without fusion. Note that either dnn or onnxruntime does not fuse those expanded function operators. ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - [x] There is a reference to the original bug report and related work - [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [x] The feature is well documented and sample code can be built with the project CMake
2023-11-20 16:19:24 +08:00
{"test_layer_normalization_2d_axis0", 3, 1},
{"test_layer_normalization_2d_axis1", 3, 1},
{"test_layer_normalization_2d_axis_negative_1", 3, 1},
{"test_layer_normalization_2d_axis_negative_2", 3, 1},
{"test_layer_normalization_3d_axis0_epsilon", 3, 1},
{"test_layer_normalization_3d_axis1_epsilon", 3, 1},
{"test_layer_normalization_3d_axis2_epsilon", 3, 1},
{"test_layer_normalization_3d_axis_negative_1_epsilon", 3, 1},
{"test_layer_normalization_3d_axis_negative_2_epsilon", 3, 1},
{"test_layer_normalization_3d_axis_negative_3_epsilon", 3, 1},
{"test_layer_normalization_4d_axis0", 3, 1},
{"test_layer_normalization_4d_axis1", 3, 1},
{"test_layer_normalization_4d_axis2", 3, 1},
{"test_layer_normalization_4d_axis3", 3, 1},
{"test_layer_normalization_4d_axis_negative_1", 3, 1},
{"test_layer_normalization_4d_axis_negative_2", 3, 1},
{"test_layer_normalization_4d_axis_negative_3", 3, 1},
{"test_layer_normalization_4d_axis_negative_4", 3, 1},
{"test_layer_normalization_default_axis", 3, 1},
{"test_leakyrelu", 1, 1},
{"test_leakyrelu_default", 1, 1},
{"test_leakyrelu_example", 1, 1},
{"test_less", 2, 1},
{"test_less_bcast", 2, 1},
{"test_less_equal", 2, 1},
{"test_less_equal_bcast", 2, 1},
{"test_less_equal_bcast_expanded", 2, 1},
{"test_less_equal_expanded", 2, 1},
{"test_log", 1, 1},
{"test_log_example", 1, 1},
{"test_logsoftmax_axis_0", 1, 1},
{"test_logsoftmax_axis_0_expanded", 1, 1},
{"test_logsoftmax_axis_1", 1, 1},
{"test_logsoftmax_axis_1_expanded", 1, 1},
{"test_logsoftmax_axis_2", 1, 1},
{"test_logsoftmax_axis_2_expanded", 1, 1},
{"test_logsoftmax_default_axis", 1, 1},
{"test_logsoftmax_default_axis_expanded", 1, 1},
{"test_logsoftmax_example_1", 1, 1},
{"test_logsoftmax_example_1_expanded", 1, 1},
{"test_logsoftmax_large_number", 1, 1},
{"test_logsoftmax_large_number_expanded", 1, 1},
{"test_logsoftmax_negative_axis", 1, 1},
{"test_logsoftmax_negative_axis_expanded", 1, 1},
{"test_loop11", 3, 2},
{"test_loop13_seq", 3, 1},
{"test_loop16_seq_none", 3, 1},
{"test_lrn", 1, 1},
{"test_lrn_default", 1, 1},
{"test_lstm_batchwise", 3, 2},
{"test_lstm_defaults", 3, 1},
{"test_lstm_with_initial_bias", 4, 1},
{"test_lstm_with_peepholes", 8, 1},
{"test_matmul_2d", 2, 1},
{"test_matmul_3d", 2, 1},
{"test_matmul_4d", 2, 1},
{"test_matmulinteger", 4, 1},
{"test_max_example", 3, 1},
{"test_max_float16", 2, 1},
{"test_max_float32", 2, 1},
{"test_max_float64", 2, 1},
{"test_max_int16", 2, 1},
{"test_max_int32", 2, 1},
{"test_max_int64", 2, 1},
{"test_max_int8", 2, 1},
{"test_max_one_input", 1, 1},
{"test_max_two_inputs", 2, 1},
{"test_max_uint16", 2, 1},
{"test_max_uint32", 2, 1},
{"test_max_uint64", 2, 1},
{"test_max_uint8", 2, 1},
{"test_maxpool_1d_default", 1, 1},
{"test_maxpool_2d_ceil", 1, 1},
{"test_maxpool_2d_default", 1, 1},
{"test_maxpool_2d_dilations", 1, 1},
{"test_maxpool_2d_pads", 1, 1},
{"test_maxpool_2d_precomputed_pads", 1, 1},
{"test_maxpool_2d_precomputed_same_upper", 1, 1},
{"test_maxpool_2d_precomputed_strides", 1, 1},
{"test_maxpool_2d_same_lower", 1, 1},
{"test_maxpool_2d_same_upper", 1, 1},
{"test_maxpool_2d_strides", 1, 1},
{"test_maxpool_2d_uint8", 1, 1},
{"test_maxpool_3d_default", 1, 1},
{"test_maxpool_with_argmax_2d_precomputed_pads", 1, 2},
{"test_maxpool_with_argmax_2d_precomputed_strides", 1, 2},
{"test_maxunpool_export_with_output_shape", 3, 1},
{"test_maxunpool_export_without_output_shape", 2, 1},
{"test_mean_example", 3, 1},
{"test_mean_one_input", 1, 1},
{"test_mean_two_inputs", 2, 1},
{"test_min_example", 3, 1},
{"test_min_float16", 2, 1},
{"test_min_float32", 2, 1},
{"test_min_float64", 2, 1},
{"test_min_int16", 2, 1},
{"test_min_int32", 2, 1},
{"test_min_int64", 2, 1},
{"test_min_int8", 2, 1},
{"test_min_one_input", 1, 1},
{"test_min_two_inputs", 2, 1},
{"test_min_uint16", 2, 1},
{"test_min_uint32", 2, 1},
{"test_min_uint64", 2, 1},
{"test_min_uint8", 2, 1},
Merge pull request #25881 from fengyuentau:dnn/cpu/optimize_activations_with_v_exp dnn: optimize activations with v_exp #25881 Merge with https://github.com/opencv/opencv_extra/pull/1191. This PR optimizes the following activations: - [x] Swish - [x] Mish - [x] Elu - [x] Celu - [x] Selu - [x] HardSwish ### Performance (Updated on 2024-07-18) #### AmLogic A311D2 (ARM Cortex A73 + A53) ``` Geometric mean (ms) Name of Test activations activations.patch activations.patch vs activations (x-factor) Celu::Layer_Elementwise::OCV/CPU 115.859 27.930 4.15 Elu::Layer_Elementwise::OCV/CPU 27.846 27.003 1.03 Gelu::Layer_Elementwise::OCV/CPU 0.657 0.602 1.09 HardSwish::Layer_Elementwise::OCV/CPU 31.885 6.781 4.70 Mish::Layer_Elementwise::OCV/CPU 35.729 32.089 1.11 Selu::Layer_Elementwise::OCV/CPU 61.955 27.850 2.22 Swish::Layer_Elementwise::OCV/CPU 30.819 26.688 1.15 ``` #### Apple M1 ``` Geometric mean (ms) Name of Test activations activations.patch activations.patch vs activations (x-factor) Celu::Layer_Elementwise::OCV/CPU 16.184 2.118 7.64 Celu::Layer_Elementwise::OCV/CPU_FP16 16.280 2.123 7.67 Elu::Layer_Elementwise::OCV/CPU 9.123 1.878 4.86 Elu::Layer_Elementwise::OCV/CPU_FP16 9.085 1.897 4.79 Gelu::Layer_Elementwise::OCV/CPU 0.089 0.081 1.11 Gelu::Layer_Elementwise::OCV/CPU_FP16 0.086 0.074 1.17 HardSwish::Layer_Elementwise::OCV/CPU 1.560 1.555 1.00 HardSwish::Layer_Elementwise::OCV/CPU_FP16 1.536 1.523 1.01 Mish::Layer_Elementwise::OCV/CPU 6.077 2.476 2.45 Mish::Layer_Elementwise::OCV/CPU_FP16 5.990 2.496 2.40 Selu::Layer_Elementwise::OCV/CPU 11.351 1.976 5.74 Selu::Layer_Elementwise::OCV/CPU_FP16 11.533 1.985 5.81 Swish::Layer_Elementwise::OCV/CPU 4.687 1.890 2.48 Swish::Layer_Elementwise::OCV/CPU_FP16 4.715 1.873 2.52 ``` #### Intel i7-12700K ``` Geometric mean (ms) Name of Test activations activations.patch activations.patch vs activations (x-factor) Celu::Layer_Elementwise::OCV/CPU 17.106 3.560 4.81 Elu::Layer_Elementwise::OCV/CPU 5.064 3.478 1.46 
Gelu::Layer_Elementwise::OCV/CPU 0.036 0.035 1.04 HardSwish::Layer_Elementwise::OCV/CPU 2.914 2.893 1.01 Mish::Layer_Elementwise::OCV/CPU 3.820 3.529 1.08 Selu::Layer_Elementwise::OCV/CPU 10.799 3.593 3.01 Swish::Layer_Elementwise::OCV/CPU 3.651 3.473 1.05 ``` ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - [x] There is a reference to the original bug report and related work - [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [x] The feature is well documented and sample code can be built with the project CMake
2024-07-19 21:03:19 +08:00
{"test_mish", 1, 1},
{"test_mish_expanded", 1, 1},
{"test_mod_broadcast", 2, 1},
{"test_mod_int64_fmod", 2, 1},
{"test_mod_mixed_sign_float16", 2, 1},
{"test_mod_mixed_sign_float32", 2, 1},
{"test_mod_mixed_sign_float64", 2, 1},
{"test_mod_mixed_sign_int16", 2, 1},
{"test_mod_mixed_sign_int32", 2, 1},
{"test_mod_mixed_sign_int64", 2, 1},
{"test_mod_mixed_sign_int8", 2, 1},
{"test_mod_uint16", 2, 1},
{"test_mod_uint32", 2, 1},
{"test_mod_uint64", 2, 1},
{"test_mod_uint8", 2, 1},
{"test_momentum", 5, 2},
{"test_momentum_multiple", 8, 4},
{"test_mul", 2, 1},
{"test_mul_bcast", 2, 1},
{"test_mul_example", 2, 1},
{"test_mul_uint8", 2, 1},
{"test_mvn", 1, 1},
{"test_mvn_expanded", 1, 1},
{"test_neg", 1, 1},
{"test_neg_example", 1, 1},
{"test_nesterov_momentum", 5, 2},
{"test_nllloss_NC", 2, 1},
{"test_nllloss_NC_expanded", 2, 1},
{"test_nllloss_NCd1", 2, 1},
{"test_nllloss_NCd1_expanded", 2, 1},
{"test_nllloss_NCd1_ii", 2, 1},
{"test_nllloss_NCd1_ii_expanded", 2, 1},
{"test_nllloss_NCd1_mean_weight_negative_ii", 3, 1},
{"test_nllloss_NCd1_mean_weight_negative_ii_expanded", 3, 1},
{"test_nllloss_NCd1_weight", 3, 1},
{"test_nllloss_NCd1_weight_expanded", 3, 1},
{"test_nllloss_NCd1_weight_ii", 3, 1},
{"test_nllloss_NCd1_weight_ii_expanded", 3, 1},
{"test_nllloss_NCd1d2", 2, 1},
{"test_nllloss_NCd1d2_expanded", 2, 1},
{"test_nllloss_NCd1d2_no_weight_reduction_mean_ii", 2, 1},
{"test_nllloss_NCd1d2_no_weight_reduction_mean_ii_expanded", 2, 1},
{"test_nllloss_NCd1d2_reduction_mean", 2, 1},
{"test_nllloss_NCd1d2_reduction_mean_expanded", 2, 1},
{"test_nllloss_NCd1d2_reduction_sum", 2, 1},
{"test_nllloss_NCd1d2_reduction_sum_expanded", 2, 1},
{"test_nllloss_NCd1d2_with_weight", 3, 1},
{"test_nllloss_NCd1d2_with_weight_expanded", 3, 1},
{"test_nllloss_NCd1d2_with_weight_reduction_mean", 3, 1},
{"test_nllloss_NCd1d2_with_weight_reduction_mean_expanded", 3, 1},
{"test_nllloss_NCd1d2_with_weight_reduction_sum", 3, 1},
{"test_nllloss_NCd1d2_with_weight_reduction_sum_expanded", 3, 1},
{"test_nllloss_NCd1d2_with_weight_reduction_sum_ii", 3, 1},
{"test_nllloss_NCd1d2_with_weight_reduction_sum_ii_expanded", 3, 1},
{"test_nllloss_NCd1d2d3_none_no_weight_negative_ii", 2, 1},
{"test_nllloss_NCd1d2d3_none_no_weight_negative_ii_expanded", 2, 1},
{"test_nllloss_NCd1d2d3_sum_weight_high_ii", 3, 1},
{"test_nllloss_NCd1d2d3_sum_weight_high_ii_expanded", 3, 1},
{"test_nllloss_NCd1d2d3d4d5_mean_weight", 3, 1},
{"test_nllloss_NCd1d2d3d4d5_mean_weight_expanded", 3, 1},
{"test_nllloss_NCd1d2d3d4d5_none_no_weight", 2, 1},
{"test_nllloss_NCd1d2d3d4d5_none_no_weight_expanded", 2, 1},
{"test_nonmaxsuppression_center_point_box_format", 5, 1},
{"test_nonmaxsuppression_flipped_coordinates", 5, 1},
{"test_nonmaxsuppression_identical_boxes", 5, 1},
{"test_nonmaxsuppression_limit_output_size", 5, 1},
{"test_nonmaxsuppression_single_box", 5, 1},
{"test_nonmaxsuppression_suppress_by_IOU", 5, 1},
{"test_nonmaxsuppression_suppress_by_IOU_and_scores", 5, 1},
{"test_nonmaxsuppression_two_batches", 5, 1},
{"test_nonmaxsuppression_two_classes", 5, 1},
{"test_nonzero_example", 1, 1},
{"test_not_2d", 1, 1},
{"test_not_3d", 1, 1},
{"test_not_4d", 1, 1},
{"test_onehot_negative_indices", 3, 1},
{"test_onehot_with_axis", 3, 1},
{"test_onehot_with_negative_axis", 3, 1},
{"test_onehot_without_axis", 3, 1},
{"test_optional_get_element", 1, 1},
{"test_optional_get_element_sequence", 1, 1},
{"test_optional_has_element", 1, 1},
{"test_optional_has_element_empty", 1, 1},
{"test_or2d", 2, 1},
{"test_or3d", 2, 1},
{"test_or4d", 2, 1},
{"test_or_bcast3v1d", 2, 1},
{"test_or_bcast3v2d", 2, 1},
{"test_or_bcast4v2d", 2, 1},
{"test_or_bcast4v3d", 2, 1},
{"test_or_bcast4v4d", 2, 1},
{"test_pow", 2, 1},
{"test_pow_bcast_array", 2, 1},
{"test_pow_bcast_scalar", 2, 1},
{"test_pow_example", 2, 1},
{"test_pow_types_float", 2, 1},
{"test_pow_types_float32_int32", 2, 1},
{"test_pow_types_float32_int64", 2, 1},
{"test_pow_types_float32_uint32", 2, 1},
{"test_pow_types_float32_uint64", 2, 1},
{"test_pow_types_int", 2, 1},
{"test_pow_types_int32_float32", 2, 1},
{"test_pow_types_int32_int32", 2, 1},
{"test_pow_types_int64_float32", 2, 1},
{"test_pow_types_int64_int64", 2, 1},
{"test_prelu_broadcast", 2, 1},
{"test_prelu_example", 2, 1},
{"test_qlinearconv", 8, 1},
{"test_qlinearmatmul_2D", 8, 1},
{"test_qlinearmatmul_3D", 8, 1},
{"test_quantizelinear", 3, 1},
{"test_quantizelinear_axis", 3, 1},
{"test_range_float_type_positive_delta", 3, 1},
{"test_range_float_type_positive_delta_expanded", 3, 1},
{"test_range_int32_type_negative_delta", 3, 1},
{"test_range_int32_type_negative_delta_expanded", 3, 1},
{"test_reciprocal", 1, 1},
{"test_reciprocal_example", 1, 1},
{"test_reduce_l1_default_axes_keepdims_example", 1, 1},
{"test_reduce_l1_default_axes_keepdims_random", 1, 1},
{"test_reduce_l1_do_not_keepdims_example", 1, 1},
{"test_reduce_l1_do_not_keepdims_random", 1, 1},
{"test_reduce_l1_keep_dims_example", 1, 1},
{"test_reduce_l1_keep_dims_random", 1, 1},
{"test_reduce_l1_negative_axes_keep_dims_example", 1, 1},
{"test_reduce_l1_negative_axes_keep_dims_random", 1, 1},
{"test_reduce_l2_default_axes_keepdims_example", 1, 1},
{"test_reduce_l2_default_axes_keepdims_random", 1, 1},
{"test_reduce_l2_do_not_keepdims_example", 1, 1},
{"test_reduce_l2_do_not_keepdims_random", 1, 1},
{"test_reduce_l2_keep_dims_example", 1, 1},
{"test_reduce_l2_keep_dims_random", 1, 1},
{"test_reduce_l2_negative_axes_keep_dims_example", 1, 1},
{"test_reduce_l2_negative_axes_keep_dims_random", 1, 1},
{"test_reduce_log_sum", 1, 1},
{"test_reduce_log_sum_asc_axes", 1, 1},
{"test_reduce_log_sum_default", 1, 1},
{"test_reduce_log_sum_desc_axes", 1, 1},
{"test_reduce_log_sum_exp_default_axes_keepdims_example", 1, 1},
{"test_reduce_log_sum_exp_default_axes_keepdims_random", 1, 1},
{"test_reduce_log_sum_exp_do_not_keepdims_example", 1, 1},
{"test_reduce_log_sum_exp_do_not_keepdims_random", 1, 1},
{"test_reduce_log_sum_exp_keepdims_example", 1, 1},
{"test_reduce_log_sum_exp_keepdims_random", 1, 1},
{"test_reduce_log_sum_exp_negative_axes_keepdims_example", 1, 1},
{"test_reduce_log_sum_exp_negative_axes_keepdims_random", 1, 1},
{"test_reduce_log_sum_negative_axes", 1, 1},
{"test_reduce_max_default_axes_keepdim_example", 1, 1},
{"test_reduce_max_default_axes_keepdims_random", 1, 1},
{"test_reduce_max_do_not_keepdims_example", 1, 1},
{"test_reduce_max_do_not_keepdims_random", 1, 1},
{"test_reduce_max_keepdims_example", 1, 1},
{"test_reduce_max_keepdims_random", 1, 1},
{"test_reduce_max_negative_axes_keepdims_example", 1, 1},
{"test_reduce_max_negative_axes_keepdims_random", 1, 1},
{"test_reduce_mean_default_axes_keepdims_example", 1, 1},
{"test_reduce_mean_default_axes_keepdims_random", 1, 1},
{"test_reduce_mean_do_not_keepdims_example", 1, 1},
{"test_reduce_mean_do_not_keepdims_random", 1, 1},
{"test_reduce_mean_keepdims_example", 1, 1},
{"test_reduce_mean_keepdims_random", 1, 1},
{"test_reduce_mean_negative_axes_keepdims_example", 1, 1},
{"test_reduce_mean_negative_axes_keepdims_random", 1, 1},
{"test_reduce_min_default_axes_keepdims_example", 1, 1},
{"test_reduce_min_default_axes_keepdims_random", 1, 1},
{"test_reduce_min_do_not_keepdims_example", 1, 1},
{"test_reduce_min_do_not_keepdims_random", 1, 1},
{"test_reduce_min_keepdims_example", 1, 1},
{"test_reduce_min_keepdims_random", 1, 1},
{"test_reduce_min_negative_axes_keepdims_example", 1, 1},
{"test_reduce_min_negative_axes_keepdims_random", 1, 1},
{"test_reduce_prod_default_axes_keepdims_example", 1, 1},
{"test_reduce_prod_default_axes_keepdims_random", 1, 1},
{"test_reduce_prod_do_not_keepdims_example", 1, 1},
{"test_reduce_prod_do_not_keepdims_random", 1, 1},
{"test_reduce_prod_keepdims_example", 1, 1},
{"test_reduce_prod_keepdims_random", 1, 1},
{"test_reduce_prod_negative_axes_keepdims_example", 1, 1},
{"test_reduce_prod_negative_axes_keepdims_random", 1, 1},
{"test_reduce_sum_default_axes_keepdims_example", 2, 1},
{"test_reduce_sum_default_axes_keepdims_random", 2, 1},
{"test_reduce_sum_do_not_keepdims_example", 2, 1},
{"test_reduce_sum_do_not_keepdims_random", 2, 1},
{"test_reduce_sum_empty_axes_input_noop_example", 2, 1},
{"test_reduce_sum_empty_axes_input_noop_random", 2, 1},
{"test_reduce_sum_keepdims_example", 2, 1},
{"test_reduce_sum_keepdims_random", 2, 1},
{"test_reduce_sum_negative_axes_keepdims_example", 2, 1},
{"test_reduce_sum_negative_axes_keepdims_random", 2, 1},
{"test_reduce_sum_square_default_axes_keepdims_example", 1, 1},
{"test_reduce_sum_square_default_axes_keepdims_random", 1, 1},
{"test_reduce_sum_square_do_not_keepdims_example", 1, 1},
{"test_reduce_sum_square_do_not_keepdims_random", 1, 1},
{"test_reduce_sum_square_keepdims_example", 1, 1},
{"test_reduce_sum_square_keepdims_random", 1, 1},
{"test_reduce_sum_square_negative_axes_keepdims_example", 1, 1},
{"test_reduce_sum_square_negative_axes_keepdims_random", 1, 1},
{"test_reflect_pad", 2, 1},
{"test_relu", 1, 1},
{"test_reshape_allowzero_reordered", 2, 1},
{"test_reshape_extended_dims", 2, 1},
{"test_reshape_negative_dim", 2, 1},
{"test_reshape_negative_extended_dims", 2, 1},
{"test_reshape_one_dim", 2, 1},
{"test_reshape_reduced_dims", 2, 1},
{"test_reshape_reordered_all_dims", 2, 1},
{"test_reshape_reordered_last_dims", 2, 1},
{"test_reshape_zero_and_negative_dim", 2, 1},
{"test_reshape_zero_dim", 2, 1},
{"test_resize_downsample_scales_cubic", 2, 1},
{"test_resize_downsample_scales_cubic_A_n0p5_exclude_outside", 2, 1},
{"test_resize_downsample_scales_cubic_align_corners", 2, 1},
{"test_resize_downsample_scales_linear", 2, 1},
{"test_resize_downsample_scales_linear_align_corners", 2, 1},
{"test_resize_downsample_scales_nearest", 2, 1},
{"test_resize_downsample_sizes_cubic", 2, 1},
{"test_resize_downsample_sizes_linear_pytorch_half_pixel", 2, 1},
{"test_resize_downsample_sizes_nearest", 2, 1},
{"test_resize_downsample_sizes_nearest_tf_half_pixel_for_nn", 2, 1},
{"test_resize_tf_crop_and_resize", 3, 1},
{"test_resize_upsample_scales_cubic", 2, 1},
{"test_resize_upsample_scales_cubic_A_n0p5_exclude_outside", 2, 1},
{"test_resize_upsample_scales_cubic_align_corners", 2, 1},
{"test_resize_upsample_scales_cubic_asymmetric", 2, 1},
{"test_resize_upsample_scales_linear", 2, 1},
{"test_resize_upsample_scales_linear_align_corners", 2, 1},
{"test_resize_upsample_scales_nearest", 2, 1},
{"test_resize_upsample_sizes_cubic", 2, 1},
{"test_resize_upsample_sizes_nearest", 2, 1},
{"test_resize_upsample_sizes_nearest_ceil_half_pixel", 2, 1},
{"test_resize_upsample_sizes_nearest_floor_align_corners", 2, 1},
{"test_resize_upsample_sizes_nearest_round_prefer_ceil_asymmetric", 2, 1},
{"test_reversesequence_batch", 2, 1},
{"test_reversesequence_time", 2, 1},
{"test_rnn_seq_length", 4, 1},
{"test_roialign_aligned_false", 3, 1},
{"test_roialign_aligned_true", 3, 1},
{"test_round", 1, 1},
{"test_scan9_sum", 2, 2},
{"test_scan_sum", 2, 2},
{"test_scatter_elements_with_axis", 3, 1},
{"test_scatter_elements_with_duplicate_indices", 3, 1},
{"test_scatter_elements_with_negative_indices", 3, 1},
{"test_scatter_elements_with_reduction_max", 3, 1},
{"test_scatter_elements_with_reduction_min", 3, 1},
{"test_scatter_elements_without_axis", 3, 1},
{"test_scatter_with_axis", 3, 1},
{"test_scatter_without_axis", 3, 1},
{"test_scatternd", 3, 1},
{"test_scatternd_add", 3, 1},
{"test_scatternd_max", 3, 1},
{"test_scatternd_min", 3, 1},
{"test_scatternd_multiply", 3, 1},
{"test_sce_NCd1_mean_weight_negative_ii", 3, 1},
{"test_sce_NCd1_mean_weight_negative_ii_expanded", 3, 1},
{"test_sce_NCd1_mean_weight_negative_ii_log_prob", 3, 2},
{"test_sce_NCd1_mean_weight_negative_ii_log_prob_expanded", 3, 2},
{"test_sce_NCd1d2d3_none_no_weight_negative_ii", 2, 1},
{"test_sce_NCd1d2d3_none_no_weight_negative_ii_expanded", 2, 1},
{"test_sce_NCd1d2d3_none_no_weight_negative_ii_log_prob", 2, 2},
{"test_sce_NCd1d2d3_none_no_weight_negative_ii_log_prob_expanded", 2, 2},
{"test_sce_NCd1d2d3_sum_weight_high_ii", 3, 1},
{"test_sce_NCd1d2d3_sum_weight_high_ii_expanded", 3, 1},
{"test_sce_NCd1d2d3_sum_weight_high_ii_log_prob", 3, 2},
{"test_sce_NCd1d2d3_sum_weight_high_ii_log_prob_expanded", 3, 2},
{"test_sce_NCd1d2d3d4d5_mean_weight", 3, 1},
{"test_sce_NCd1d2d3d4d5_mean_weight_expanded", 3, 1},
{"test_sce_NCd1d2d3d4d5_mean_weight_log_prob", 3, 2},
{"test_sce_NCd1d2d3d4d5_mean_weight_log_prob_expanded", 3, 2},
{"test_sce_NCd1d2d3d4d5_none_no_weight", 2, 1},
{"test_sce_NCd1d2d3d4d5_none_no_weight_expanded", 2, 1},
{"test_sce_NCd1d2d3d4d5_none_no_weight_log_prob", 2, 2},
{"test_sce_NCd1d2d3d4d5_none_no_weight_log_prob_expanded", 2, 2},
{"test_sce_mean", 2, 1},
{"test_sce_mean_3d", 2, 1},
{"test_sce_mean_3d_expanded", 2, 1},
{"test_sce_mean_3d_log_prob", 2, 2},
{"test_sce_mean_3d_log_prob_expanded", 2, 2},
{"test_sce_mean_expanded", 2, 1},
{"test_sce_mean_log_prob", 2, 2},
{"test_sce_mean_log_prob_expanded", 2, 2},
{"test_sce_mean_no_weight_ii", 2, 1},
{"test_sce_mean_no_weight_ii_3d", 2, 1},
{"test_sce_mean_no_weight_ii_3d_expanded", 2, 1},
{"test_sce_mean_no_weight_ii_3d_log_prob", 2, 2},
{"test_sce_mean_no_weight_ii_3d_log_prob_expanded", 2, 2},
{"test_sce_mean_no_weight_ii_4d", 2, 1},
{"test_sce_mean_no_weight_ii_4d_expanded", 2, 1},
{"test_sce_mean_no_weight_ii_4d_log_prob", 2, 2},
{"test_sce_mean_no_weight_ii_4d_log_prob_expanded", 2, 2},
{"test_sce_mean_no_weight_ii_expanded", 2, 1},
{"test_sce_mean_no_weight_ii_log_prob", 2, 2},
{"test_sce_mean_no_weight_ii_log_prob_expanded", 2, 2},
{"test_sce_mean_weight", 3, 1},
{"test_sce_mean_weight_expanded", 3, 1},
{"test_sce_mean_weight_ii", 3, 1},
{"test_sce_mean_weight_ii_3d", 3, 1},
{"test_sce_mean_weight_ii_3d_expanded", 3, 1},
{"test_sce_mean_weight_ii_3d_log_prob", 3, 2},
{"test_sce_mean_weight_ii_3d_log_prob_expanded", 3, 2},
{"test_sce_mean_weight_ii_4d", 3, 1},
{"test_sce_mean_weight_ii_4d_expanded", 3, 1},
{"test_sce_mean_weight_ii_4d_log_prob", 3, 2},
{"test_sce_mean_weight_ii_4d_log_prob_expanded", 3, 2},
{"test_sce_mean_weight_ii_expanded", 3, 1},
{"test_sce_mean_weight_ii_log_prob", 3, 2},
{"test_sce_mean_weight_ii_log_prob_expanded", 3, 2},
{"test_sce_mean_weight_log_prob", 3, 2},
{"test_sce_mean_weight_log_prob_expanded", 3, 2},
{"test_sce_none", 2, 1},
{"test_sce_none_expanded", 2, 1},
{"test_sce_none_log_prob", 2, 2},
{"test_sce_none_log_prob_expanded", 2, 2},
{"test_sce_none_weights", 3, 1},
{"test_sce_none_weights_expanded", 3, 1},
{"test_sce_none_weights_log_prob", 3, 2},
{"test_sce_none_weights_log_prob_expanded", 3, 2},
{"test_sce_sum", 2, 1},
{"test_sce_sum_expanded", 2, 1},
{"test_sce_sum_log_prob", 2, 2},
{"test_sce_sum_log_prob_expanded", 2, 2},
{"test_selu", 1, 1},
{"test_selu_default", 1, 1},
Merge pull request #25881 from fengyuentau:dnn/cpu/optimize_activations_with_v_exp dnn: optimize activations with v_exp #25881 Merge with https://github.com/opencv/opencv_extra/pull/1191. This PR optimizes the following activations: - [x] Swish - [x] Mish - [x] Elu - [x] Celu - [x] Selu - [x] HardSwish ### Performance (Updated on 2024-07-18) #### AmLogic A311D2 (ARM Cortex A73 + A53) ``` Geometric mean (ms) Name of Test activations activations.patch activations.patch vs activations (x-factor) Celu::Layer_Elementwise::OCV/CPU 115.859 27.930 4.15 Elu::Layer_Elementwise::OCV/CPU 27.846 27.003 1.03 Gelu::Layer_Elementwise::OCV/CPU 0.657 0.602 1.09 HardSwish::Layer_Elementwise::OCV/CPU 31.885 6.781 4.70 Mish::Layer_Elementwise::OCV/CPU 35.729 32.089 1.11 Selu::Layer_Elementwise::OCV/CPU 61.955 27.850 2.22 Swish::Layer_Elementwise::OCV/CPU 30.819 26.688 1.15 ``` #### Apple M1 ``` Geometric mean (ms) Name of Test activations activations.patch activations.patch vs activations (x-factor) Celu::Layer_Elementwise::OCV/CPU 16.184 2.118 7.64 Celu::Layer_Elementwise::OCV/CPU_FP16 16.280 2.123 7.67 Elu::Layer_Elementwise::OCV/CPU 9.123 1.878 4.86 Elu::Layer_Elementwise::OCV/CPU_FP16 9.085 1.897 4.79 Gelu::Layer_Elementwise::OCV/CPU 0.089 0.081 1.11 Gelu::Layer_Elementwise::OCV/CPU_FP16 0.086 0.074 1.17 HardSwish::Layer_Elementwise::OCV/CPU 1.560 1.555 1.00 HardSwish::Layer_Elementwise::OCV/CPU_FP16 1.536 1.523 1.01 Mish::Layer_Elementwise::OCV/CPU 6.077 2.476 2.45 Mish::Layer_Elementwise::OCV/CPU_FP16 5.990 2.496 2.40 Selu::Layer_Elementwise::OCV/CPU 11.351 1.976 5.74 Selu::Layer_Elementwise::OCV/CPU_FP16 11.533 1.985 5.81 Swish::Layer_Elementwise::OCV/CPU 4.687 1.890 2.48 Swish::Layer_Elementwise::OCV/CPU_FP16 4.715 1.873 2.52 ``` #### Intel i7-12700K ``` Geometric mean (ms) Name of Test activations activations.patch activations.patch vs activations (x-factor) Celu::Layer_Elementwise::OCV/CPU 17.106 3.560 4.81 Elu::Layer_Elementwise::OCV/CPU 5.064 3.478 1.46 
Gelu::Layer_Elementwise::OCV/CPU 0.036 0.035 1.04 HardSwish::Layer_Elementwise::OCV/CPU 2.914 2.893 1.01 Mish::Layer_Elementwise::OCV/CPU 3.820 3.529 1.08 Selu::Layer_Elementwise::OCV/CPU 10.799 3.593 3.01 Swish::Layer_Elementwise::OCV/CPU 3.651 3.473 1.05 ``` ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - [x] There is a reference to the original bug report and related work - [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [x] The feature is well documented and sample code can be built with the project CMake
2024-07-19 21:03:19 +08:00
{"test_selu_default_expanded_ver18", 1, 1},
{"test_selu_example", 1, 1},
Merge pull request #25881 from fengyuentau:dnn/cpu/optimize_activations_with_v_exp dnn: optimize activations with v_exp #25881 Merge with https://github.com/opencv/opencv_extra/pull/1191. This PR optimizes the following activations: - [x] Swish - [x] Mish - [x] Elu - [x] Celu - [x] Selu - [x] HardSwish ### Performance (Updated on 2024-07-18) #### AmLogic A311D2 (ARM Cortex A73 + A53) ``` Geometric mean (ms) Name of Test activations activations.patch activations.patch vs activations (x-factor) Celu::Layer_Elementwise::OCV/CPU 115.859 27.930 4.15 Elu::Layer_Elementwise::OCV/CPU 27.846 27.003 1.03 Gelu::Layer_Elementwise::OCV/CPU 0.657 0.602 1.09 HardSwish::Layer_Elementwise::OCV/CPU 31.885 6.781 4.70 Mish::Layer_Elementwise::OCV/CPU 35.729 32.089 1.11 Selu::Layer_Elementwise::OCV/CPU 61.955 27.850 2.22 Swish::Layer_Elementwise::OCV/CPU 30.819 26.688 1.15 ``` #### Apple M1 ``` Geometric mean (ms) Name of Test activations activations.patch activations.patch vs activations (x-factor) Celu::Layer_Elementwise::OCV/CPU 16.184 2.118 7.64 Celu::Layer_Elementwise::OCV/CPU_FP16 16.280 2.123 7.67 Elu::Layer_Elementwise::OCV/CPU 9.123 1.878 4.86 Elu::Layer_Elementwise::OCV/CPU_FP16 9.085 1.897 4.79 Gelu::Layer_Elementwise::OCV/CPU 0.089 0.081 1.11 Gelu::Layer_Elementwise::OCV/CPU_FP16 0.086 0.074 1.17 HardSwish::Layer_Elementwise::OCV/CPU 1.560 1.555 1.00 HardSwish::Layer_Elementwise::OCV/CPU_FP16 1.536 1.523 1.01 Mish::Layer_Elementwise::OCV/CPU 6.077 2.476 2.45 Mish::Layer_Elementwise::OCV/CPU_FP16 5.990 2.496 2.40 Selu::Layer_Elementwise::OCV/CPU 11.351 1.976 5.74 Selu::Layer_Elementwise::OCV/CPU_FP16 11.533 1.985 5.81 Swish::Layer_Elementwise::OCV/CPU 4.687 1.890 2.48 Swish::Layer_Elementwise::OCV/CPU_FP16 4.715 1.873 2.52 ``` #### Intel i7-12700K ``` Geometric mean (ms) Name of Test activations activations.patch activations.patch vs activations (x-factor) Celu::Layer_Elementwise::OCV/CPU 17.106 3.560 4.81 Elu::Layer_Elementwise::OCV/CPU 5.064 3.478 1.46 
Gelu::Layer_Elementwise::OCV/CPU 0.036 0.035 1.04 HardSwish::Layer_Elementwise::OCV/CPU 2.914 2.893 1.01 Mish::Layer_Elementwise::OCV/CPU 3.820 3.529 1.08 Selu::Layer_Elementwise::OCV/CPU 10.799 3.593 3.01 Swish::Layer_Elementwise::OCV/CPU 3.651 3.473 1.05 ``` ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - [x] There is a reference to the original bug report and related work - [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [x] The feature is well documented and sample code can be built with the project CMake
2024-07-19 21:03:19 +08:00
{"test_selu_example_expanded_ver18", 1, 1},
{"test_selu_expanded_ver18", 1, 1},
{"test_sequence_insert_at_back", 2, 1},
{"test_sequence_insert_at_front", 3, 1},
{"test_shape", 1, 1},
{"test_shape_clip_end", 1, 1},
{"test_shape_clip_start", 1, 1},
{"test_shape_end_1", 1, 1},
{"test_shape_end_negative_1", 1, 1},
{"test_shape_example", 1, 1},
{"test_shape_start_1", 1, 1},
{"test_shape_start_1_end_2", 1, 1},
{"test_shape_start_1_end_negative_1", 1, 1},
{"test_shape_start_negative_1", 1, 1},
{"test_shrink_hard", 1, 1},
{"test_shrink_soft", 1, 1},
{"test_sigmoid", 1, 1},
{"test_sigmoid_example", 1, 1},
{"test_sign", 1, 1},
{"test_simple_rnn_batchwise", 3, 2},
{"test_simple_rnn_defaults", 3, 1},
{"test_simple_rnn_with_initial_bias", 4, 1},
{"test_sin", 1, 1},
{"test_sin_example", 1, 1},
{"test_sinh", 1, 1},
{"test_sinh_example", 1, 1},
{"test_size", 1, 1},
{"test_size_example", 1, 1},
{"test_slice", 5, 1},
{"test_slice_default_axes", 3, 1},
{"test_slice_default_steps", 4, 1},
{"test_slice_end_out_of_bounds", 5, 1},
{"test_slice_neg", 5, 1},
{"test_slice_neg_steps", 5, 1},
{"test_slice_negative_axes", 4, 1},
{"test_slice_start_out_of_bounds", 5, 1},
{"test_softmax_axis_0", 1, 1},
{"test_softmax_axis_0_expanded", 1, 1},
{"test_softmax_axis_1", 1, 1},
{"test_softmax_axis_1_expanded", 1, 1},
{"test_softmax_axis_2", 1, 1},
{"test_softmax_axis_2_expanded", 1, 1},
{"test_softmax_default_axis", 1, 1},
{"test_softmax_default_axis_expanded", 1, 1},
{"test_softmax_example", 1, 1},
{"test_softmax_example_expanded", 1, 1},
{"test_softmax_large_number", 1, 1},
{"test_softmax_large_number_expanded", 1, 1},
{"test_softmax_negative_axis", 1, 1},
{"test_softmax_negative_axis_expanded", 1, 1},
{"test_softplus", 1, 1},
{"test_softplus_example", 1, 1},
{"test_softsign", 1, 1},
{"test_softsign_example", 1, 1},
{"test_spacetodepth", 1, 1},
{"test_spacetodepth_example", 1, 1},
{"test_split_equal_parts_1d", 1, 3},
{"test_split_equal_parts_2d", 1, 2},
{"test_split_equal_parts_default_axis", 1, 3},
{"test_split_variable_parts_1d", 2, 2},
{"test_split_variable_parts_2d", 2, 2},
{"test_split_variable_parts_default_axis", 2, 2},
{"test_split_zero_size_splits", 2, 3},
{"test_sqrt", 1, 1},
{"test_sqrt_example", 1, 1},
{"test_squeeze", 2, 1},
{"test_squeeze_negative_axes", 2, 1},
{"test_strnormalizer_export_monday_casesensintive_lower", 1, 1},
{"test_strnormalizer_export_monday_casesensintive_nochangecase", 1, 1},
{"test_strnormalizer_export_monday_casesensintive_upper", 1, 1},
{"test_strnormalizer_export_monday_empty_output", 1, 1},
{"test_strnormalizer_export_monday_insensintive_upper_twodim", 1, 1},
{"test_strnormalizer_nostopwords_nochangecase", 1, 1},
{"test_sub", 2, 1},
{"test_sub_bcast", 2, 1},
{"test_sub_example", 2, 1},
{"test_sub_uint8", 2, 1},
{"test_sum_example", 3, 1},
{"test_sum_one_input", 1, 1},
{"test_sum_two_inputs", 2, 1},
{"test_tan", 1, 1},
{"test_tan_example", 1, 1},
{"test_tanh", 1, 1},
{"test_tanh_example", 1, 1},
{"test_tfidfvectorizer_tf_batch_onlybigrams_skip0", 1, 1},
{"test_tfidfvectorizer_tf_batch_onlybigrams_skip5", 1, 1},
{"test_tfidfvectorizer_tf_batch_uniandbigrams_skip5", 1, 1},
{"test_tfidfvectorizer_tf_only_bigrams_skip0", 1, 1},
{"test_tfidfvectorizer_tf_onlybigrams_levelempty", 1, 1},
{"test_tfidfvectorizer_tf_onlybigrams_skip5", 1, 1},
{"test_tfidfvectorizer_tf_uniandbigrams_skip5", 1, 1},
{"test_thresholdedrelu", 1, 1},
{"test_thresholdedrelu_default", 1, 1},
{"test_thresholdedrelu_example", 1, 1},
{"test_tile", 2, 1},
{"test_tile_precomputed", 2, 1},
{"test_top_k", 2, 2},
{"test_top_k_negative_axis", 2, 2},
{"test_top_k_smallest", 2, 2},
{"test_training_dropout", 3, 1},
{"test_training_dropout_default", 3, 1},
{"test_training_dropout_default_mask", 3, 2},
{"test_training_dropout_mask", 3, 2},
{"test_training_dropout_zero_ratio", 3, 1},
{"test_training_dropout_zero_ratio_mask", 3, 2},
{"test_transpose_all_permutations_0", 1, 1},
{"test_transpose_all_permutations_1", 1, 1},
{"test_transpose_all_permutations_2", 1, 1},
{"test_transpose_all_permutations_3", 1, 1},
{"test_transpose_all_permutations_4", 1, 1},
{"test_transpose_all_permutations_5", 1, 1},
{"test_transpose_default", 1, 1},
{"test_tril", 1, 1},
{"test_tril_neg", 2, 1},
{"test_tril_one_row_neg", 1, 1},
{"test_tril_out_neg", 2, 1},
{"test_tril_out_pos", 2, 1},
{"test_tril_pos", 2, 1},
{"test_tril_square", 1, 1},
{"test_tril_square_neg", 2, 1},
{"test_tril_zero", 2, 1},
{"test_triu", 1, 1},
{"test_triu_neg", 2, 1},
{"test_triu_one_row", 2, 1},
{"test_triu_out_neg_out", 2, 1},
{"test_triu_out_pos", 2, 1},
{"test_triu_pos", 2, 1},
{"test_triu_square", 1, 1},
{"test_triu_square_neg", 2, 1},
{"test_triu_zero", 2, 1},
{"test_unique_not_sorted_without_axis", 1, 4},
{"test_unique_sorted_with_axis", 1, 4},
{"test_unique_sorted_with_axis_3d", 1, 4},
{"test_unique_sorted_with_negative_axis", 1, 4},
{"test_unique_sorted_without_axis", 1, 4},
{"test_unsqueeze_axis_0", 2, 1},
{"test_unsqueeze_axis_1", 2, 1},
{"test_unsqueeze_axis_2", 2, 1},
{"test_unsqueeze_axis_3", 1, 1},
{"test_unsqueeze_negative_axes", 2, 1},
{"test_unsqueeze_three_axes", 2, 1},
{"test_unsqueeze_two_axes", 2, 1},
{"test_unsqueeze_unsorted_axes", 2, 1},
{"test_upsample_nearest", 2, 1},
{"test_where_example", 3, 1},
{"test_where_long_example", 3, 1},
{"test_xor2d", 2, 1},
{"test_xor3d", 2, 1},
{"test_xor4d", 2, 1},
{"test_xor_bcast3v1d", 2, 1},
{"test_xor_bcast3v2d", 2, 1},
{"test_xor_bcast4v2d", 2, 1},
{"test_xor_bcast4v3d", 2, 1},
{"test_xor_bcast4v4d", 2, 1},
};
// Stream insertion for TestCase: writes only the case's name and hands the
// same stream back so further insertions can be chained.
std::ostream& operator<<(std::ostream& os, const TestCase& test_case)
{
    os << test_case.name;
    return os;
}
// Parameter tuple for one conformance test instance: the test case
// paired with a (backend, target) tuple.
typedef tuple<TestCase, tuple<Backend, Target> > ONNXConfParams;
// Builds the label for a parameterized conformance test instance in the form
// "<test case name>_<backend>_<target>", where the backend and target parts
// are whatever their PrintTo overloads write into the stream.
std::string printOnnxConfParams(const testing::TestParamInfo<ONNXConfParams>& params)
{
    const auto& backend_and_target = get<1>(params.param);

    std::stringstream label;
    label << get<0>(params.param).name;
    label << "_";
    PrintTo(get<0>(backend_and_target), &label);
    label << "_";
    PrintTo(get<1>(backend_and_target), &label);
    return label.str();
}
class Test_ONNX_conformance : public TestWithParam<ONNXConfParams>
{
public:
TestCase test_case;
Backend backend;
Target target;
double default_l1;
double default_lInf;
static std::set<std::string> parser_deny_list;
static std::set<std::string> global_deny_list;
static std::set<std::string> opencv_deny_list;
static std::set<std::string> opencl_fp16_deny_list;
static std::set<std::string> opencl_deny_list;
static std::set<std::string> cpu_deny_list;
#ifdef HAVE_HALIDE
static std::set<std::string> halide_deny_list;
#endif
#ifdef HAVE_VULKAN
static std::set<std::string> vulkan_deny_list;
#endif
#ifdef HAVE_CUDA
static std::set<std::string> cuda_deny_list;
Merge pull request #25630 from fengyuentau:nary-multi-thread dnn: parallelize nary elementwise forward implementation & enable related conformance tests #25630 This PR introduces the following changes: - [x] Parallelize binary forward impl - [x] Parallelize ternary forward impl (Where) - [x] Parallelize nary (Operator that can take >=1 operands) - [x] Enable conformance tests if workable ## Performance ### i7-12700K, RAM 64GB, Ubuntu 22.04 ``` Geometric mean (ms) Name of Test opencv opencv opencv perf perf perf core.x64.0606 core.x64.0606 core.x64.0606 vs opencv perf core.x64.0606 (x-factor) NCHW_C_sum::Layer_NaryEltwise::OCV/CPU 16.116 11.161 1.44 NCHW_NCHW_add::Layer_NaryEltwise::OCV/CPU 17.469 11.446 1.53 NCHW_NCHW_div::Layer_NaryEltwise::OCV/CPU 17.531 11.469 1.53 NCHW_NCHW_equal::Layer_NaryEltwise::OCV/CPU 28.653 13.682 2.09 NCHW_NCHW_greater::Layer_NaryEltwise::OCV/CPU 21.899 13.422 1.63 NCHW_NCHW_less::Layer_NaryEltwise::OCV/CPU 21.738 13.185 1.65 NCHW_NCHW_max::Layer_NaryEltwise::OCV/CPU 16.172 11.473 1.41 NCHW_NCHW_mean::Layer_NaryEltwise::OCV/CPU 16.309 11.565 1.41 NCHW_NCHW_min::Layer_NaryEltwise::OCV/CPU 16.166 11.454 1.41 NCHW_NCHW_mul::Layer_NaryEltwise::OCV/CPU 16.157 11.443 1.41 NCHW_NCHW_pow::Layer_NaryEltwise::OCV/CPU 163.459 15.234 10.73 NCHW_NCHW_ref_div::Layer_NaryEltwise::OCV/CPU 10.880 10.868 1.00 NCHW_NCHW_ref_max::Layer_NaryEltwise::OCV/CPU 10.947 11.058 0.99 NCHW_NCHW_ref_min::Layer_NaryEltwise::OCV/CPU 10.948 10.910 1.00 NCHW_NCHW_ref_mul::Layer_NaryEltwise::OCV/CPU 10.874 10.871 1.00 NCHW_NCHW_ref_sum::Layer_NaryEltwise::OCV/CPU 10.971 10.920 1.00 NCHW_NCHW_sub::Layer_NaryEltwise::OCV/CPU 17.546 11.462 1.53 NCHW_NCHW_sum::Layer_NaryEltwise::OCV/CPU 16.175 11.475 1.41 NHWC_C::Layer_NaryEltwise::OCV/CPU 11.339 11.333 1.00 NHWC_H::Layer_NaryEltwise::OCV/CPU 16.154 11.102 1.46 ``` ### Apple M1, RAM 16GB, macOS 14.4.1 ``` Geometric mean (ms) Name of Test opencv opencv opencv perf perf perf core.m1.0606 core.m1.0606.patch core.m1.0606.patch vs 
opencv perf core.m1.0606 (x-factor) NCHW_C_sum::Layer_NaryEltwise::OCV/CPU 28.418 3.768 7.54 NCHW_NCHW_add::Layer_NaryEltwise::OCV/CPU 6.942 5.679 1.22 NCHW_NCHW_div::Layer_NaryEltwise::OCV/CPU 5.822 5.653 1.03 NCHW_NCHW_equal::Layer_NaryEltwise::OCV/CPU 5.751 5.628 1.02 NCHW_NCHW_greater::Layer_NaryEltwise::OCV/CPU 5.797 5.599 1.04 NCHW_NCHW_less::Layer_NaryEltwise::OCV/CPU 7.272 5.578 1.30 NCHW_NCHW_max::Layer_NaryEltwise::OCV/CPU 5.777 5.562 1.04 NCHW_NCHW_mean::Layer_NaryEltwise::OCV/CPU 5.819 5.559 1.05 NCHW_NCHW_min::Layer_NaryEltwise::OCV/CPU 5.830 5.574 1.05 NCHW_NCHW_mul::Layer_NaryEltwise::OCV/CPU 5.759 5.567 1.03 NCHW_NCHW_pow::Layer_NaryEltwise::OCV/CPU 342.260 74.655 4.58 NCHW_NCHW_ref_div::Layer_NaryEltwise::OCV/CPU 8.338 8.280 1.01 NCHW_NCHW_ref_max::Layer_NaryEltwise::OCV/CPU 8.359 8.309 1.01 NCHW_NCHW_ref_min::Layer_NaryEltwise::OCV/CPU 8.412 8.295 1.01 NCHW_NCHW_ref_mul::Layer_NaryEltwise::OCV/CPU 8.380 8.297 1.01 NCHW_NCHW_ref_sum::Layer_NaryEltwise::OCV/CPU 8.356 8.323 1.00 NCHW_NCHW_sub::Layer_NaryEltwise::OCV/CPU 6.818 5.561 1.23 NCHW_NCHW_sum::Layer_NaryEltwise::OCV/CPU 5.805 5.570 1.04 NHWC_C::Layer_NaryEltwise::OCV/CPU 3.834 4.817 0.80 NHWC_H::Layer_NaryEltwise::OCV/CPU 28.402 3.771 7.53 ``` ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - [ ] There is a reference to the original bug report and related work - [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [ ] The feature is well documented and sample code can be built with the project CMake
2024-07-03 15:09:05 +08:00
static std::set<std::string> cuda_fp16_deny_list;
#endif
Test_ONNX_conformance()
{
test_case = get<0>(GetParam());
backend = get<0>(get<1>(GetParam()));
target = get<1>(get<1>(GetParam()));
Merge pull request #22275 from zihaomu:fp16_support_conv DNN: FP16 support on Convolution 2D #22275 ## FP16 support on ARM platform This PR proposes to support FP16 backend in Convolution. For now, we only support FP16 at ARM aarch64. In addition to adding fp16, I also added `separateIm2col` optimization in this patch. ## How to use FP16 to speed up convolution? ``` Net net = readNet(modelPath); net.setPreferableTarget(DNN_TARGET_CPU_FP16); net.setInput(blob); Mat output = net.forward(); ``` ### TODO List | Task | Status | Remarks | |:-------:|:--------:|:------------:| | Convolution 2D FP16 | :heavy_check_mark: | Done | | Winograd FP16 | Because the current modification has reached 2k lines, winograd fp16 will be completed in the next PR. | | | Accuracy Test | :heavy_check_mark: | Done | | Performance Test | :heavy_check_mark: | Done | | Compiler bug | :heavy_check_mark: | Done | ### Speed Test for FP16. **Test on M1 chip, 4 threads.** | Model Name | FP32 (Conv+Wino) | Conv(FP16) + Wino(FP32) | |:-------:|:--------:|:------------:| | ResNet 50 | 26.0 ms | **18.05 ms** (25% speed up)| | MobileNet V2 | 4.17 ms | **3.09 ms (29% speed up)** | ### Speed Test for `separateIm2col` trick on X86. 
**Test on AMD 5600x, 12 threads.** | Model Name | 4.x | Patch | |:-------:|:--------:|:------------:| | MobileNet V2 | 5.6 ms | **3.0 ms (46% speed up)** | ### Performance Test #### Performance Test of X86 platform: AMD 5600X, with `-perf_threas=1` |Name of Test|4.x|patch|patch vs 4.x (x-factor)| |---|:-:|:-:|:-:| |Name of Test|4.x 0|fp16pr final|fp16pr final vs 4.x 0 (x-factor)| |---|:-:|:-:|:-:| |conv1d::Conv1D::(GFLOPS=0.000, K=[3], IN={1, 2, 19}, OCN=2, G=2, S=2, P=(1, 1), BIAS, OCV/CPU)|0.001|0.001|1.00| |conv1d::Conv1D::(GFLOPS=0.000, K=[3], IN={1, 2, 25}, OCN=2, G=2, P=(2, 2), PM=SAME, OCV/CPU)|0.001|0.001|1.03| |conv1d::Conv1D::(GFLOPS=0.000, K=[3], IN={1, 6, 10}, OCN=6, PM=VALID, BIAS, OCV/CPU)|0.001|0.001|0.92| |conv3d::Conv3D::(GFLOPS=0.000, K=[1 x 1 x 1], IN={1, 4, 9, 10, 10}, OCN=4, S=[1 x 1 x 2], P=(1, 1) x (1, 1) x (1, 1), PM=VALID, OCV/CPU)|0.002|0.003|0.95| |conv3d::Conv3D::(GFLOPS=0.000, K=[1 x 1 x 1], IN={1, 8, 1, 10, 10}, OCN=8, G=8, P=(1, 1) x (1, 1) x (1, 1), BIAS, OCV/CPU)|0.006|0.006|1.00| |conv3d::Conv3D::(GFLOPS=0.000, K=[3 x 3 x 3], IN={1, 2, 19, 19, 19}, OCN=2, G=2, S=[2 x 2 x 2], P=(1, 1) x (1, 1) x (1, 1), BIAS, OCV/CPU)|0.045|0.033|1.39| |conv3d::Conv3D::(GFLOPS=0.000, K=[3 x 4 x 2], IN={1, 4, 8, 10, 10}, OCN=4, G=4, S=[1 x 2 x 1], BIAS, OCV/CPU)|0.011|0.009|1.17| |conv3d::Conv3D::(GFLOPS=0.001, K=[3 x 3 x 3], IN={1, 2, 25, 19, 19}, OCN=2, G=2, S=[1 x 2 x 2], P=(2, 2) x (2, 2) x (2, 2), PM=SAME, OCV/CPU)|0.109|0.078|1.39| |conv3d::Conv3D::(GFLOPS=0.002, K=[3 x 1 x 4], IN={1, 14, 5, 10, 10}, OCN=14, PM=SAME, OCV/CPU)|0.040|0.042|0.94| |conv3d::Conv3D::(GFLOPS=0.006, K=[5 x 5 x 5], IN={1, 4, 50, 19, 19}, OCN=4, S=[2 x 2 x 2], P=(1, 1) x (1, 1) x (1, 1), PM=VALID, OCV/CPU)|0.326|0.342|0.95| |conv3d::Conv3D::(GFLOPS=0.027, K=[3 x 3 x 3], IN={1, 6, 10, 38, 50}, OCN=6, PM=VALID, BIAS, OCV/CPU)|0.580|0.589|0.99| |conv3d::Conv3D::(GFLOPS=0.030, K=[5 x 5 x 5], IN={1, 6, 19, 19, 19}, OCN=6, G=2, OCV/CPU)|1.293|1.382|0.94| 
|conv3d::Conv3D::(GFLOPS=0.045, K=[7 x 7 x 7], IN={1, 2, 38, 38, 38}, OCN=2, S=[1 x 2 x 1], OCV/CPU)|3.590|3.710|0.97| |conv3d::Conv3D::(GFLOPS=0.053, K=[3 x 3 x 3], IN={1, 10, 98, 10, 10}, OCN=10, PM=SAME, OCV/CPU)|1.120|1.191|0.94| |conv3d::Conv3D::(GFLOPS=0.071, K=[7 x 7 x 7], IN={1, 6, 15, 19, 19}, OCN=6, S=[2 x 1 x 1], P=(3, 3) x (3, 3) x (3, 3), PM=SAME, BIAS, OCV/CPU)|2.576|2.872|0.90| |conv3d::Conv3D::(GFLOPS=0.093, K=[5 x 5 x 5], IN={1, 4, 40, 75, 75}, OCN=4, S=[2 x 2 x 2], OCV/CPU)|4.599|4.670|0.98| |conv3d::Conv3D::(GFLOPS=0.116, K=[5 x 5 x 5], IN={1, 2, 21, 75, 100}, OCN=2, BIAS, OCV/CPU)|9.230|9.582|0.96| |conv3d::Conv3D::(GFLOPS=1.267, K=[5 x 5 x 5], IN={1, 3, 75, 75, 100}, OCN=3, PM=SAME, BIAS, OCV/CPU)|65.946|69.381|0.95| |conv3d::Conv3D::(GFLOPS=1.343, K=[3 x 3 x 3], IN={1, 11, 9, 150, 200}, OCN=11, PM=VALID, BIAS, OCV/CPU)|18.915|19.289|0.98| |conv::Conv::(GFLOPS=0.177, K=[1 x 1], IN={1, 512, 26, 26}, OCN=256, OCV/CPU)|1.404|1.457|0.96| |conv::Conv::(GFLOPS=0.177, K=[1 x 1], IN={1, 1024, 13, 13}, OCN=512, OCV/CPU)|2.060|1.501|1.37| |conv::Conv::(GFLOPS=0.178, K=[1 x 1], IN={1, 256, 52, 52}, OCN=128, OCV/CPU)|1.409|1.464|0.96| |conv::Conv::(GFLOPS=0.210, K=[1 x 1], IN={1, 576, 38, 50}, OCN=96, PM=SAME, BIAS, OCV/CPU)|1.793|1.838|0.98| |conv::Conv::(GFLOPS=0.231, K=[3 x 3], IN={1, 128, 56, 56}, OCN=32, P=[1 x 1], OCV/CPU)|1.207|1.199|1.01| |conv::Conv::(GFLOPS=0.231, K=[3 x 3], IN={1, 256, 14, 14}, OCN=256, P=[1 x 1], OCV/CPU)|1.277|1.275|1.00| |conv::Conv::(GFLOPS=0.280, K=[1 x 1], IN={1, 576, 38, 50}, OCN=128, PM=SAME, BIAS, OCV/CPU)|2.319|2.370|0.98| |conv::Conv::(GFLOPS=0.302, K=[3 x 3], IN={1, 64, 64, 64}, OCN=64, PM=SAME, OCV/CPU)|1.351|1.346|1.00| |conv::Conv::(GFLOPS=0.357, K=[1 x 1], IN={1, 64, 208, 208}, OCN=64, OCV/CPU)|3.520|3.612|0.97| |conv::Conv::(GFLOPS=0.420, K=[3 x 3], IN={1, 96, 38, 50}, OCN=128, PM=SAME, BIAS, OCV/CPU)|1.876|1.880|1.00| |conv::Conv::(GFLOPS=0.472, K=[3 x 3], IN={1, 128, 40, 40}, OCN=128, PM=SAME, 
OCV/CPU)|1.981|1.995|0.99| |conv::Conv::(GFLOPS=0.472, K=[3 x 3], IN={1, 256, 20, 20}, OCN=256, PM=SAME, OCV/CPU)|2.620|2.627|1.00| |conv::Conv::(GFLOPS=0.472, K=[3 x 3], IN={1, 512, 10, 10}, OCN=512, PM=SAME, OCV/CPU)|4.202|4.123|1.02| |conv::Conv::(GFLOPS=0.561, K=[3 x 3], IN={1, 128, 38, 50}, OCN=128, PM=SAME, BIAS, OCV/CPU)|2.429|2.445|0.99| |conv::Conv::(GFLOPS=0.624, K=[3 x 3], IN={1, 128, 46, 46}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|2.591|2.576|1.01| |conv::Conv::(GFLOPS=0.701, K=[3 x 3], IN={1, 128, 38, 50}, OCN=160, PM=SAME, BIAS, OCV/CPU)|3.005|2.998|1.00| |conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 64, 104, 104}, OCN=64, P=[1 x 1], OCV/CPU)|3.515|3.532|1.00| |conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 128, 52, 52}, OCN=128, P=[1 x 1], OCV/CPU)|3.115|3.134|0.99| |conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 256, 26, 26}, OCN=256, P=[1 x 1], OCV/CPU)|3.937|3.899|1.01| |conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 512, 13, 13}, OCN=512, P=[1 x 1], OCV/CPU)|5.533|5.471|1.01| |conv::Conv::(GFLOPS=0.830, K=[3 x 3], IN={1, 64, 75, 100}, OCN=96, PM=SAME, BIAS, OCV/CPU)|3.472|3.464|1.00| |conv::Conv::(GFLOPS=0.958, K=[3 x 3], IN={1, 192, 38, 38}, OCN=192, PM=SAME, OCV/CPU)|4.302|4.322|1.00| |conv::Conv::(GFLOPS=0.958, K=[3 x 3], IN={1, 384, 19, 19}, OCN=384, PM=SAME, OCV/CPU)|6.100|6.035|1.01| |conv::Conv::(GFLOPS=1.022, K=[3 x 3], IN={1, 576, 19, 19}, OCN=273, PM=SAME, BIAS, OCV/CPU)|6.580|6.484|1.01| |conv::Conv::(GFLOPS=1.112, K=[3 x 3], IN={1, 512, 10, 10}, OCN=1206, P=[1 x 1], BIAS, OCV/CPU)|9.741|9.634|1.01| |conv::Conv::(GFLOPS=1.181, K=[3 x 3], IN={1, 64, 160, 200}, OCN=128, S=[2 x 2], P=[1 x 1], BIAS, OCV/CPU)|10.131|10.156|1.00| |conv::Conv::(GFLOPS=1.182, K=[3 x 3], IN={1, 32, 320, 400}, OCN=64, S=[2 x 2], P=[1 x 1], BIAS, OCV/CPU)|12.391|12.350|1.00| |conv::Conv::(GFLOPS=1.195, K=[9 x 9], IN={1, 32, 240, 320}, OCN=3, P=[4 x 4], BIAS, OCV/CPU)|91.074|87.893|1.04| |conv::Conv::(GFLOPS=1.196, K=[3 x 3], IN={1, 384, 26, 26}, OCN=256, P=[1 x 1], 
OCV/CPU)|5.903|5.903|1.00| |conv::Conv::(GFLOPS=1.210, K=[3 x 3], IN={1, 32, 256, 256}, OCN=32, PM=SAME, OCV/CPU)|6.890|6.794|1.01| |conv::Conv::(GFLOPS=1.245, K=[3 x 3], IN={1, 64, 75, 75}, OCN=192, PM=SAME, BIAS, OCV/CPU)|5.160|5.131|1.01| |conv::Conv::(GFLOPS=1.245, K=[3 x 3], IN={1, 96, 75, 100}, OCN=96, PM=SAME, BIAS, OCV/CPU)|4.970|5.036|0.99| |conv::Conv::(GFLOPS=1.248, K=[3 x 3], IN={1, 256, 46, 46}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|5.045|5.015|1.01| |conv::Conv::(GFLOPS=1.258, K=[3 x 3], IN={1, 1280, 10, 10}, OCN=546, PM=SAME, BIAS, OCV/CPU)|11.583|11.343|1.02| |conv::Conv::(GFLOPS=1.261, K=[3 x 3], IN={1, 192, 38, 50}, OCN=192, PM=SAME, BIAS, OCV/CPU)|5.348|5.320|1.01| |conv::Conv::(GFLOPS=1.416, K=[3 x 3], IN={1, 128, 62, 82}, OCN=128, BIAS, OCV/CPU)|5.357|5.396|0.99| |conv::Conv::(GFLOPS=1.500, K=[3 x 3], IN={1, 128, 64, 84}, OCN=128, BIAS, OCV/CPU)|6.050|6.006|1.01| |conv::Conv::(GFLOPS=1.586, K=[3 x 3], IN={1, 128, 66, 86}, OCN=128, BIAS, OCV/CPU)|5.952|5.953|1.00| |conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 256, 26, 26}, OCN=512, P=[1 x 1], OCV/CPU)|8.014|8.014|1.00| |conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 256, 52, 52}, OCN=512, S=[2 x 2], P=[1 x 1], OCV/CPU)|12.472|12.577|0.99| |conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 512, 13, 13}, OCN=1024, P=[1 x 1], OCV/CPU)|10.803|10.655|1.01| |conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 512, 26, 26}, OCN=1024, S=[2 x 2], P=[1 x 1], OCV/CPU)|18.429|13.405|1.37| |conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 64, 104, 104}, OCN=128, P=[1 x 1], OCV/CPU)|6.659|6.647|1.00| |conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 64, 208, 208}, OCN=128, S=[2 x 2], P=[1 x 1], OCV/CPU)|14.192|13.819|1.03| |conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 128, 52, 52}, OCN=256, P=[1 x 1], OCV/CPU)|6.045|6.068|1.00| |conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 128, 104, 104}, OCN=256, S=[2 x 2], P=[1 x 1], OCV/CPU)|12.742|12.828|0.99| |conv::Conv::(GFLOPS=1.598, K=[3 x 3], IN={1, 32, 208, 208}, OCN=64, P=[1 x 1], 
OCV/CPU)|8.046|7.773|1.04| |conv::Conv::(GFLOPS=1.598, K=[3 x 3], IN={1, 32, 416, 416}, OCN=64, S=[2 x 2], P=[1 x 1], OCV/CPU)|17.440|17.192|1.01| |conv::Conv::(GFLOPS=1.659, K=[3 x 3], IN={1, 960, 10, 10}, OCN=960, PM=SAME, OCV/CPU)|15.418|14.972|1.03| |conv::Conv::(GFLOPS=1.660, K=[3 x 3], IN={1, 128, 75, 75}, OCN=128, G=128, P=[1 x 1], BIAS, OCV/CPU)|0.430|0.430|1.00| |conv::Conv::(GFLOPS=1.660, K=[3 x 3], IN={1, 128, 75, 75}, OCN=128, PM=SAME, OCV/CPU)|6.692|6.663|1.00| |conv::Conv::(GFLOPS=1.675, K=[3 x 3], IN={1, 128, 68, 88}, OCN=128, BIAS, OCV/CPU)|6.350|6.347|1.00| |conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 256, 38, 38}, OCN=256, G=256, P=[1 x 1], BIAS, OCV/CPU)|0.267|0.265|1.01| |conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 256, 38, 38}, OCN=256, PM=SAME, OCV/CPU)|7.755|7.558|1.03| |conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 512, 19, 19}, OCN=512, G=512, P=[1 x 1], BIAS, OCV/CPU)|0.203|0.202|1.00| |conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 512, 19, 19}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|10.663|10.576|1.01| |conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 512, 19, 19}, OCN=512, PM=SAME, OCV/CPU)|10.827|10.614|1.02| |conv::Conv::(GFLOPS=1.766, K=[3 x 3], IN={1, 128, 70, 90}, OCN=128, BIAS, OCV/CPU)|7.049|6.947|1.01| |conv::Conv::(GFLOPS=1.859, K=[3 x 3], IN={1, 128, 72, 92}, OCN=128, BIAS, OCV/CPU)|6.900|6.901|1.00| |conv::Conv::(GFLOPS=1.888, K=[3 x 3], IN={1, 1024, 10, 10}, OCN=1024, G=1024, P=[1 x 1], BIAS, OCV/CPU)|0.165|0.165|1.00| |conv::Conv::(GFLOPS=1.888, K=[3 x 3], IN={1, 1024, 10, 10}, OCN=1024, PM=SAME, OCV/CPU)|17.953|17.251|1.04| |conv::Conv::(GFLOPS=1.954, K=[3 x 3], IN={1, 128, 74, 94}, OCN=128, BIAS, OCV/CPU)|7.430|7.320|1.01| |conv::Conv::(GFLOPS=1.995, K=[9 x 9], IN={1, 3, 320, 400}, OCN=32, P=[4 x 4], BIAS, OCV/CPU)|22.187|21.705|1.02| |conv::Conv::(GFLOPS=2.052, K=[3 x 3], IN={1, 128, 76, 96}, OCN=128, BIAS, OCV/CPU)|8.349|8.126|1.03| |conv::Conv::(GFLOPS=2.100, K=[3 x 3], IN={1, 144, 75, 75}, OCN=144, PM=SAME, 
OCV/CPU)|8.273|8.297|1.00| |conv::Conv::(GFLOPS=2.153, K=[3 x 3], IN={1, 128, 78, 98}, OCN=128, BIAS, OCV/CPU)|8.169|8.094|1.01| |conv::Conv::(GFLOPS=2.156, K=[3 x 3], IN={1, 576, 19, 19}, OCN=576, PM=SAME, OCV/CPU)|13.602|13.359|1.02| |conv::Conv::(GFLOPS=2.255, K=[3 x 3], IN={1, 128, 80, 100}, OCN=128, BIAS, OCV/CPU)|8.633|8.584|1.01| |conv::Conv::(GFLOPS=2.719, K=[3 x 3], IN={1, 96, 256, 256}, OCN=96, S=[2 x 2], PM=SAME, OCV/CPU)|29.339|28.897|1.02| |conv::Conv::(GFLOPS=3.319, K=[3 x 3], IN={1, 128, 75, 75}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|13.000|12.920|1.01| |conv::Conv::(GFLOPS=3.321, K=[3 x 3], IN={1, 64, 150, 150}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|14.262|13.319|1.07| |conv::Conv::(GFLOPS=3.398, K=[7 x 7], IN={1, 128, 46, 46}, OCN=128, P=[3 x 3], BIAS, OCV/CPU)|27.453|27.253|1.01| |conv::Conv::(GFLOPS=3.407, K=[3 x 3], IN={1, 512, 19, 19}, OCN=1024, D=[6 x 6], P=[6 x 6], BIAS, OCV/CPU)|32.052|27.269|1.18| |conv::Conv::(GFLOPS=3.408, K=[3 x 3], IN={1, 256, 38, 38}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|15.363|15.208|1.01| |conv::Conv::(GFLOPS=4.247, K=[3 x 3], IN={1, 480, 32, 32}, OCN=480, PM=SAME, OCV/CPU)|18.543|18.434|1.01| |conv::Conv::(GFLOPS=4.247, K=[5 x 5], IN={1, 144, 128, 128}, OCN=144, S=[2 x 2], PM=SAME, OCV/CPU)|39.114|37.954|1.03| |conv::Conv::(GFLOPS=4.566, K=[7 x 7], IN={1, 172, 46, 46}, OCN=128, P=[3 x 3], BIAS, OCV/CPU)|36.271|36.972|0.98| |conv::Conv::(GFLOPS=4.993, K=[3 x 3], IN={1, 256, 46, 46}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|19.262|19.427|0.99| |conv::Conv::(GFLOPS=4.993, K=[3 x 3], IN={1, 512, 46, 46}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|19.298|19.349|1.00| |conv::Conv::(GFLOPS=4.994, K=[3 x 3], IN={1, 128, 92, 92}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|20.261|19.847|1.02| |conv::Conv::(GFLOPS=4.997, K=[3 x 3], IN={1, 64, 184, 184}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|21.867|21.525|1.02| |conv::Conv::(GFLOPS=5.780, K=[5 x 5], IN={1, 672, 32, 32}, OCN=672, S=[2 x 2], PM=SAME, OCV/CPU)|51.756|49.979|1.04| |conv::Conv::(GFLOPS=6.116, K=[3 
x 3], IN={1, 1152, 16, 16}, OCN=1152, PM=SAME, OCV/CPU)|28.133|27.060|1.04| |conv::Conv::(GFLOPS=6.118, K=[3 x 3], IN={1, 144, 128, 128}, OCN=144, PM=SAME, OCV/CPU)|25.035|24.980|1.00| |conv::Conv::(GFLOPS=6.637, K=[3 x 3], IN={1, 256, 75, 75}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|25.858|25.821|1.00| |conv::Conv::(GFLOPS=6.638, K=[3 x 3], IN={1, 128, 150, 150}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|27.313|27.149|1.01| |conv::Conv::(GFLOPS=6.641, K=[3 x 3], IN={1, 64, 150, 200}, OCN=192, PM=SAME, BIAS, OCV/CPU)|28.219|28.111|1.00| |conv::Conv::(GFLOPS=6.641, K=[3 x 3], IN={1, 64, 300, 300}, OCN=64, P=[1 x 1], BIAS, OCV/CPU)|46.025|46.674|0.99| |conv::Conv::(GFLOPS=6.814, K=[3 x 3], IN={1, 512, 38, 38}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|30.220|29.446|1.03| |conv::Conv::(GFLOPS=8.025, K=[3 x 3], IN={1, 1024, 19, 19}, OCN=1206, P=[1 x 1], BIAS, OCV/CPU)|49.410|48.708|1.01| |conv::Conv::(GFLOPS=9.986, K=[3 x 3], IN={1, 512, 46, 46}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|38.203|38.001|1.01| |conv::Conv::(GFLOPS=9.987, K=[3 x 3], IN={1, 256, 92, 92}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|39.961|39.021|1.02| |conv::Conv::(GFLOPS=9.989, K=[3 x 3], IN={1, 128, 184, 184}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|48.685|47.075|1.03| |conv::Conv::(GFLOPS=9.993, K=[3 x 3], IN={1, 64, 368, 368}, OCN=64, P=[1 x 1], BIAS, OCV/CPU)|75.114|72.586|1.03| |conv::Conv::(GFLOPS=10.087, K=[3 x 3], IN={1, 576, 38, 50}, OCN=512, PM=SAME, BIAS, OCV/CPU)|41.222|41.144|1.00| |conv::Conv::(GFLOPS=10.701, K=[3 x 3], IN={1, 512, 38, 38}, OCN=804, P=[1 x 1], BIAS, OCV/CPU)|46.220|46.353|1.00| |conv::Conv::(GFLOPS=11.797, K=[5 x 5], IN={1, 240, 64, 64}, OCN=240, PM=SAME, OCV/CPU)|98.201|98.771|0.99| |conv::Conv::(GFLOPS=11.797, K=[5 x 5], IN={1, 480, 32, 32}, OCN=480, PM=SAME, OCV/CPU)|100.106|96.971|1.03| |conv::Conv::(GFLOPS=16.987, K=[5 x 5], IN={1, 1152, 16, 16}, OCN=1152, PM=SAME, OCV/CPU)|146.977|140.445|1.05| |conv::Conv::(GFLOPS=23.122, K=[5 x 5], IN={1, 672, 32, 32}, OCN=672, PM=SAME, 
OCV/CPU)|198.618|194.665|1.02| #### Performance Test of ARM platform: apple M1, with `-perf_threas=1` Min (ms) |Name of Test|4.x|patch|4.x vs patch (x-factor)| |---|:-:|:-:|:-:| |conv1d::Conv1D::(GFLOPS=0.000, K=[3], IN={1, 2, 19}, OCN=2, G=2, S=2, P=(1, 1), BIAS, OCV/CPU)|0.001|0.001|1.07| |conv1d::Conv1D::(GFLOPS=0.000, K=[3], IN={1, 2, 25}, OCN=2, G=2, P=(2, 2), PM=SAME, OCV/CPU)|0.001|0.001|1.10| |conv1d::Conv1D::(GFLOPS=0.000, K=[3], IN={1, 6, 10}, OCN=6, PM=VALID, BIAS, OCV/CPU)|0.002|0.002|0.97| |conv3d::Conv3D::(GFLOPS=0.000, K=[1 x 1 x 1], IN={1, 4, 9, 10, 10}, OCN=4, S=[1 x 1 x 2], P=(1, 1) x (1, 1) x (1, 1), PM=VALID, OCV/CPU)|0.003|0.003|0.84| |conv3d::Conv3D::(GFLOPS=0.000, K=[1 x 1 x 1], IN={1, 8, 1, 10, 10}, OCN=8, G=8, P=(1, 1) x (1, 1) x (1, 1), BIAS, OCV/CPU)|0.009|0.009|1.00| |conv3d::Conv3D::(GFLOPS=0.000, K=[3 x 3 x 3], IN={1, 2, 19, 19, 19}, OCN=2, G=2, S=[2 x 2 x 2], P=(1, 1) x (1, 1) x (1, 1), BIAS, OCV/CPU)|0.027|0.030|0.90| |conv3d::Conv3D::(GFLOPS=0.000, K=[3 x 4 x 2], IN={1, 4, 8, 10, 10}, OCN=4, G=4, S=[1 x 2 x 1], BIAS, OCV/CPU)|0.008|0.007|1.07| |conv3d::Conv3D::(GFLOPS=0.001, K=[3 x 3 x 3], IN={1, 2, 25, 19, 19}, OCN=2, G=2, S=[1 x 2 x 2], P=(2, 2) x (2, 2) x (2, 2), PM=SAME, OCV/CPU)|0.066|0.072|0.91| |conv3d::Conv3D::(GFLOPS=0.002, K=[3 x 1 x 4], IN={1, 14, 5, 10, 10}, OCN=14, PM=SAME, OCV/CPU)|0.090|0.054|1.68| |conv3d::Conv3D::(GFLOPS=0.006, K=[5 x 5 x 5], IN={1, 4, 50, 19, 19}, OCN=4, S=[2 x 2 x 2], P=(1, 1) x (1, 1) x (1, 1), PM=VALID, OCV/CPU)|0.328|0.409|0.80| |conv3d::Conv3D::(GFLOPS=0.027, K=[3 x 3 x 3], IN={1, 6, 10, 38, 50}, OCN=6, PM=VALID, BIAS, OCV/CPU)|0.659|0.697|0.95| |conv3d::Conv3D::(GFLOPS=0.030, K=[5 x 5 x 5], IN={1, 6, 19, 19, 19}, OCN=6, G=2, OCV/CPU)|1.266|1.403|0.90| |conv3d::Conv3D::(GFLOPS=0.045, K=[7 x 7 x 7], IN={1, 2, 38, 38, 38}, OCN=2, S=[1 x 2 x 1], OCV/CPU)|3.550|4.145|0.86| |conv3d::Conv3D::(GFLOPS=0.053, K=[3 x 3 x 3], IN={1, 10, 98, 10, 10}, OCN=10, PM=SAME, OCV/CPU)|1.188|1.375|0.86| 
|conv3d::Conv3D::(GFLOPS=0.071, K=[7 x 7 x 7], IN={1, 6, 15, 19, 19}, OCN=6, S=[2 x 1 x 1], P=(3, 3) x (3, 3) x (3, 3), PM=SAME, BIAS, OCV/CPU)|2.683|3.236|0.83| |conv3d::Conv3D::(GFLOPS=0.093, K=[5 x 5 x 5], IN={1, 4, 40, 75, 75}, OCN=4, S=[2 x 2 x 2], OCV/CPU)|4.491|5.501|0.82| |conv3d::Conv3D::(GFLOPS=0.116, K=[5 x 5 x 5], IN={1, 2, 21, 75, 100}, OCN=2, BIAS, OCV/CPU)|8.916|10.181|0.88| |conv3d::Conv3D::(GFLOPS=1.267, K=[5 x 5 x 5], IN={1, 3, 75, 75, 100}, OCN=3, PM=SAME, BIAS, OCV/CPU)|69.995|72.296|0.97| |conv3d::Conv3D::(GFLOPS=1.343, K=[3 x 3 x 3], IN={1, 11, 9, 150, 200}, OCN=11, PM=VALID, BIAS, OCV/CPU)|22.531|23.139|0.97| |conv::Conv::(GFLOPS=0.177, K=[1 x 1], IN={1, 512, 26, 26}, OCN=256, OCV/CPU)|2.239|1.933|1.16| |conv::Conv::(GFLOPS=0.177, K=[1 x 1], IN={1, 512, 26, 26}, OCN=256, OCV/CPU_FP16)|-|1.010|-| |conv::Conv::(GFLOPS=0.177, K=[1 x 1], IN={1, 1024, 13, 13}, OCN=512, OCV/CPU)|3.134|2.068|1.52| |conv::Conv::(GFLOPS=0.177, K=[1 x 1], IN={1, 1024, 13, 13}, OCN=512, OCV/CPU_FP16)|-|1.062|-| |conv::Conv::(GFLOPS=0.178, K=[1 x 1], IN={1, 256, 52, 52}, OCN=128, OCV/CPU)|1.918|1.920|1.00| |conv::Conv::(GFLOPS=0.178, K=[1 x 1], IN={1, 256, 52, 52}, OCN=128, OCV/CPU_FP16)|-|1.014|-| |conv::Conv::(GFLOPS=0.210, K=[1 x 1], IN={1, 576, 38, 50}, OCN=96, PM=SAME, BIAS, OCV/CPU)|2.340|2.352|0.99| |conv::Conv::(GFLOPS=0.210, K=[1 x 1], IN={1, 576, 38, 50}, OCN=96, PM=SAME, BIAS, OCV/CPU_FP16)|-|1.247|-| |conv::Conv::(GFLOPS=0.231, K=[3 x 3], IN={1, 128, 56, 56}, OCN=32, P=[1 x 1], OCV/CPU)|1.116|1.111|1.00| |conv::Conv::(GFLOPS=0.231, K=[3 x 3], IN={1, 128, 56, 56}, OCN=32, P=[1 x 1], OCV/CPU_FP16)|-|1.114|-| |conv::Conv::(GFLOPS=0.231, K=[3 x 3], IN={1, 256, 14, 14}, OCN=256, P=[1 x 1], OCV/CPU)|1.116|1.112|1.00| |conv::Conv::(GFLOPS=0.231, K=[3 x 3], IN={1, 256, 14, 14}, OCN=256, P=[1 x 1], OCV/CPU_FP16)|-|1.113|-| |conv::Conv::(GFLOPS=0.280, K=[1 x 1], IN={1, 576, 38, 50}, OCN=128, PM=SAME, BIAS, OCV/CPU)|3.067|3.085|0.99| |conv::Conv::(GFLOPS=0.280, K=[1 x 
1], IN={1, 576, 38, 50}, OCN=128, PM=SAME, BIAS, OCV/CPU_FP16)|-|1.622|-| |conv::Conv::(GFLOPS=0.302, K=[3 x 3], IN={1, 64, 64, 64}, OCN=64, PM=SAME, OCV/CPU)|1.153|1.187|0.97| |conv::Conv::(GFLOPS=0.302, K=[3 x 3], IN={1, 64, 64, 64}, OCN=64, PM=SAME, OCV/CPU_FP16)|-|1.150|-| |conv::Conv::(GFLOPS=0.357, K=[1 x 1], IN={1, 64, 208, 208}, OCN=64, OCV/CPU)|4.804|4.849|0.99| |conv::Conv::(GFLOPS=0.357, K=[1 x 1], IN={1, 64, 208, 208}, OCN=64, OCV/CPU_FP16)|-|2.922|-| |conv::Conv::(GFLOPS=0.420, K=[3 x 3], IN={1, 96, 38, 50}, OCN=128, PM=SAME, BIAS, OCV/CPU)|1.463|1.469|1.00| |conv::Conv::(GFLOPS=0.420, K=[3 x 3], IN={1, 96, 38, 50}, OCN=128, PM=SAME, BIAS, OCV/CPU_FP16)|-|1.459|-| |conv::Conv::(GFLOPS=0.472, K=[3 x 3], IN={1, 128, 40, 40}, OCN=128, PM=SAME, OCV/CPU)|1.577|1.580|1.00| |conv::Conv::(GFLOPS=0.472, K=[3 x 3], IN={1, 128, 40, 40}, OCN=128, PM=SAME, OCV/CPU_FP16)|-|1.580|-| |conv::Conv::(GFLOPS=0.472, K=[3 x 3], IN={1, 256, 20, 20}, OCN=256, PM=SAME, OCV/CPU)|1.826|1.818|1.00| |conv::Conv::(GFLOPS=0.472, K=[3 x 3], IN={1, 256, 20, 20}, OCN=256, PM=SAME, OCV/CPU_FP16)|-|1.817|-| |conv::Conv::(GFLOPS=0.472, K=[3 x 3], IN={1, 512, 10, 10}, OCN=512, PM=SAME, OCV/CPU)|6.541|5.081|1.29| |conv::Conv::(GFLOPS=0.472, K=[3 x 3], IN={1, 512, 10, 10}, OCN=512, PM=SAME, OCV/CPU_FP16)|-|2.809|-| |conv::Conv::(GFLOPS=0.561, K=[3 x 3], IN={1, 128, 38, 50}, OCN=128, PM=SAME, BIAS, OCV/CPU)|1.912|1.919|1.00| |conv::Conv::(GFLOPS=0.561, K=[3 x 3], IN={1, 128, 38, 50}, OCN=128, PM=SAME, BIAS, OCV/CPU_FP16)|-|1.919|-| |conv::Conv::(GFLOPS=0.624, K=[3 x 3], IN={1, 128, 46, 46}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|1.961|1.971|0.99| |conv::Conv::(GFLOPS=0.624, K=[3 x 3], IN={1, 128, 46, 46}, OCN=128, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|1.961|-| |conv::Conv::(GFLOPS=0.701, K=[3 x 3], IN={1, 128, 38, 50}, OCN=160, PM=SAME, BIAS, OCV/CPU)|2.317|2.329|0.99| |conv::Conv::(GFLOPS=0.701, K=[3 x 3], IN={1, 128, 38, 50}, OCN=160, PM=SAME, BIAS, OCV/CPU_FP16)|-|2.322|-| 
|conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 64, 104, 104}, OCN=64, P=[1 x 1], OCV/CPU)|2.920|2.947|0.99| |conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 64, 104, 104}, OCN=64, P=[1 x 1], OCV/CPU_FP16)|-|2.924|-| |conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 128, 52, 52}, OCN=128, P=[1 x 1], OCV/CPU)|2.467|2.466|1.00| |conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 128, 52, 52}, OCN=128, P=[1 x 1], OCV/CPU_FP16)|-|2.496|-| |conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 256, 26, 26}, OCN=256, P=[1 x 1], OCV/CPU)|3.028|2.997|1.01| |conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 256, 26, 26}, OCN=256, P=[1 x 1], OCV/CPU_FP16)|-|2.986|-| |conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 512, 13, 13}, OCN=512, P=[1 x 1], OCV/CPU)|4.353|4.355|1.00| |conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 512, 13, 13}, OCN=512, P=[1 x 1], OCV/CPU_FP16)|-|4.355|-| |conv::Conv::(GFLOPS=0.830, K=[3 x 3], IN={1, 64, 75, 100}, OCN=96, PM=SAME, BIAS, OCV/CPU)|2.762|2.793|0.99| |conv::Conv::(GFLOPS=0.830, K=[3 x 3], IN={1, 64, 75, 100}, OCN=96, PM=SAME, BIAS, OCV/CPU_FP16)|-|2.797|-| |conv::Conv::(GFLOPS=0.958, K=[3 x 3], IN={1, 192, 38, 38}, OCN=192, PM=SAME, OCV/CPU)|3.428|3.226|1.06| |conv::Conv::(GFLOPS=0.958, K=[3 x 3], IN={1, 192, 38, 38}, OCN=192, PM=SAME, OCV/CPU_FP16)|-|3.223|-| |conv::Conv::(GFLOPS=0.958, K=[3 x 3], IN={1, 384, 19, 19}, OCN=384, PM=SAME, OCV/CPU)|3.967|3.957|1.00| |conv::Conv::(GFLOPS=0.958, K=[3 x 3], IN={1, 384, 19, 19}, OCN=384, PM=SAME, OCV/CPU_FP16)|-|3.960|-| |conv::Conv::(GFLOPS=1.022, K=[3 x 3], IN={1, 576, 19, 19}, OCN=273, PM=SAME, BIAS, OCV/CPU)|4.806|4.387|1.10| |conv::Conv::(GFLOPS=1.022, K=[3 x 3], IN={1, 576, 19, 19}, OCN=273, PM=SAME, BIAS, OCV/CPU_FP16)|-|4.366|-| |conv::Conv::(GFLOPS=1.112, K=[3 x 3], IN={1, 512, 10, 10}, OCN=1206, P=[1 x 1], BIAS, OCV/CPU)|14.509|11.756|1.23| |conv::Conv::(GFLOPS=1.112, K=[3 x 3], IN={1, 512, 10, 10}, OCN=1206, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|6.510|-| |conv::Conv::(GFLOPS=1.181, K=[3 x 3], IN={1, 64, 160, 200}, OCN=128, 
S=[2 x 2], P=[1 x 1], BIAS, OCV/CPU)|13.718|13.287|1.03| |conv::Conv::(GFLOPS=1.181, K=[3 x 3], IN={1, 64, 160, 200}, OCN=128, S=[2 x 2], P=[1 x 1], BIAS, OCV/CPU_FP16)|-|7.190|-| |conv::Conv::(GFLOPS=1.182, K=[3 x 3], IN={1, 32, 320, 400}, OCN=64, S=[2 x 2], P=[1 x 1], BIAS, OCV/CPU)|15.133|14.853|1.02| |conv::Conv::(GFLOPS=1.182, K=[3 x 3], IN={1, 32, 320, 400}, OCN=64, S=[2 x 2], P=[1 x 1], BIAS, OCV/CPU_FP16)|-|8.671|-| |conv::Conv::(GFLOPS=1.195, K=[9 x 9], IN={1, 32, 240, 320}, OCN=3, P=[4 x 4], BIAS, OCV/CPU)|41.928|43.328|0.97| |conv::Conv::(GFLOPS=1.195, K=[9 x 9], IN={1, 32, 240, 320}, OCN=3, P=[4 x 4], BIAS, OCV/CPU_FP16)|-|38.072|-| |conv::Conv::(GFLOPS=1.196, K=[3 x 3], IN={1, 384, 26, 26}, OCN=256, P=[1 x 1], OCV/CPU)|4.409|4.428|1.00| |conv::Conv::(GFLOPS=1.196, K=[3 x 3], IN={1, 384, 26, 26}, OCN=256, P=[1 x 1], OCV/CPU_FP16)|-|4.427|-| |conv::Conv::(GFLOPS=1.210, K=[3 x 3], IN={1, 32, 256, 256}, OCN=32, PM=SAME, OCV/CPU)|6.144|5.363|1.15| |conv::Conv::(GFLOPS=1.210, K=[3 x 3], IN={1, 32, 256, 256}, OCN=32, PM=SAME, OCV/CPU_FP16)|-|5.368|-| |conv::Conv::(GFLOPS=1.245, K=[3 x 3], IN={1, 64, 75, 75}, OCN=192, PM=SAME, BIAS, OCV/CPU)|3.926|3.932|1.00| |conv::Conv::(GFLOPS=1.245, K=[3 x 3], IN={1, 64, 75, 75}, OCN=192, PM=SAME, BIAS, OCV/CPU_FP16)|-|3.938|-| |conv::Conv::(GFLOPS=1.245, K=[3 x 3], IN={1, 96, 75, 100}, OCN=96, PM=SAME, BIAS, OCV/CPU)|3.920|3.915|1.00| |conv::Conv::(GFLOPS=1.245, K=[3 x 3], IN={1, 96, 75, 100}, OCN=96, PM=SAME, BIAS, OCV/CPU_FP16)|-|3.950|-| |conv::Conv::(GFLOPS=1.248, K=[3 x 3], IN={1, 256, 46, 46}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|3.767|3.764|1.00| |conv::Conv::(GFLOPS=1.248, K=[3 x 3], IN={1, 256, 46, 46}, OCN=128, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|3.762|-| |conv::Conv::(GFLOPS=1.258, K=[3 x 3], IN={1, 1280, 10, 10}, OCN=546, PM=SAME, BIAS, OCV/CPU)|19.959|13.875|1.44| |conv::Conv::(GFLOPS=1.258, K=[3 x 3], IN={1, 1280, 10, 10}, OCN=546, PM=SAME, BIAS, OCV/CPU_FP16)|-|7.781|-| |conv::Conv::(GFLOPS=1.261, K=[3 x 3], 
IN={1, 192, 38, 50}, OCN=192, PM=SAME, BIAS, OCV/CPU)|3.951|3.955|1.00| |conv::Conv::(GFLOPS=1.261, K=[3 x 3], IN={1, 192, 38, 50}, OCN=192, PM=SAME, BIAS, OCV/CPU_FP16)|-|3.969|-| |conv::Conv::(GFLOPS=1.416, K=[3 x 3], IN={1, 128, 62, 82}, OCN=128, BIAS, OCV/CPU)|4.050|4.034|1.00| |conv::Conv::(GFLOPS=1.416, K=[3 x 3], IN={1, 128, 62, 82}, OCN=128, BIAS, OCV/CPU_FP16)|-|4.093|-| |conv::Conv::(GFLOPS=1.500, K=[3 x 3], IN={1, 128, 64, 84}, OCN=128, BIAS, OCV/CPU)|4.923|4.506|1.09| |conv::Conv::(GFLOPS=1.500, K=[3 x 3], IN={1, 128, 64, 84}, OCN=128, BIAS, OCV/CPU_FP16)|-|4.509|-| |conv::Conv::(GFLOPS=1.586, K=[3 x 3], IN={1, 128, 66, 86}, OCN=128, BIAS, OCV/CPU)|4.759|4.476|1.06| |conv::Conv::(GFLOPS=1.586, K=[3 x 3], IN={1, 128, 66, 86}, OCN=128, BIAS, OCV/CPU_FP16)|-|4.447|-| |conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 256, 26, 26}, OCN=512, P=[1 x 1], OCV/CPU)|6.079|5.628|1.08| |conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 256, 26, 26}, OCN=512, P=[1 x 1], OCV/CPU_FP16)|-|5.625|-| |conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 256, 52, 52}, OCN=512, S=[2 x 2], P=[1 x 1], OCV/CPU)|19.843|17.523|1.13| |conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 256, 52, 52}, OCN=512, S=[2 x 2], P=[1 x 1], OCV/CPU_FP16)|-|8.917|-| |conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 512, 13, 13}, OCN=1024, P=[1 x 1], OCV/CPU)|8.334|8.247|1.01| |conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 512, 13, 13}, OCN=1024, P=[1 x 1], OCV/CPU_FP16)|-|8.246|-| |conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 512, 26, 26}, OCN=1024, S=[2 x 2], P=[1 x 1], OCV/CPU)|23.164|18.199|1.27| |conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 512, 26, 26}, OCN=1024, S=[2 x 2], P=[1 x 1], OCV/CPU_FP16)|-|9.305|-| |conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 64, 104, 104}, OCN=128, P=[1 x 1], OCV/CPU)|5.184|5.178|1.00| |conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 64, 104, 104}, OCN=128, P=[1 x 1], OCV/CPU_FP16)|-|5.149|-| |conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 64, 208, 208}, OCN=128, S=[2 x 2], P=[1 x 1], 
OCV/CPU)|17.990|18.103|0.99| |conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 64, 208, 208}, OCN=128, S=[2 x 2], P=[1 x 1], OCV/CPU_FP16)|-|9.777|-| |conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 128, 52, 52}, OCN=256, P=[1 x 1], OCV/CPU)|4.831|4.522|1.07| |conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 128, 52, 52}, OCN=256, P=[1 x 1], OCV/CPU_FP16)|-|4.523|-| |conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 128, 104, 104}, OCN=256, S=[2 x 2], P=[1 x 1], OCV/CPU)|17.328|17.319|1.00| |conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 128, 104, 104}, OCN=256, S=[2 x 2], P=[1 x 1], OCV/CPU_FP16)|-|8.948|-| |conv::Conv::(GFLOPS=1.598, K=[3 x 3], IN={1, 32, 208, 208}, OCN=64, P=[1 x 1], OCV/CPU)|5.944|5.961|1.00| |conv::Conv::(GFLOPS=1.598, K=[3 x 3], IN={1, 32, 208, 208}, OCN=64, P=[1 x 1], OCV/CPU_FP16)|-|5.936|-| |conv::Conv::(GFLOPS=1.598, K=[3 x 3], IN={1, 32, 416, 416}, OCN=64, S=[2 x 2], P=[1 x 1], OCV/CPU)|19.811|20.064|0.99| |conv::Conv::(GFLOPS=1.598, K=[3 x 3], IN={1, 32, 416, 416}, OCN=64, S=[2 x 2], P=[1 x 1], OCV/CPU_FP16)|-|11.705|-| |conv::Conv::(GFLOPS=1.659, K=[3 x 3], IN={1, 960, 10, 10}, OCN=960, PM=SAME, OCV/CPU)|22.398|17.686|1.27| |conv::Conv::(GFLOPS=1.659, K=[3 x 3], IN={1, 960, 10, 10}, OCN=960, PM=SAME, OCV/CPU_FP16)|-|9.859|-| |conv::Conv::(GFLOPS=1.660, K=[3 x 3], IN={1, 128, 75, 75}, OCN=128, G=128, P=[1 x 1], BIAS, OCV/CPU)|0.416|0.416|1.00| |conv::Conv::(GFLOPS=1.660, K=[3 x 3], IN={1, 128, 75, 75}, OCN=128, G=128, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|0.417|-| |conv::Conv::(GFLOPS=1.660, K=[3 x 3], IN={1, 128, 75, 75}, OCN=128, PM=SAME, OCV/CPU)|5.356|5.110|1.05| |conv::Conv::(GFLOPS=1.660, K=[3 x 3], IN={1, 128, 75, 75}, OCN=128, PM=SAME, OCV/CPU_FP16)|-|5.114|-| |conv::Conv::(GFLOPS=1.675, K=[3 x 3], IN={1, 128, 68, 88}, OCN=128, BIAS, OCV/CPU)|5.092|4.748|1.07| |conv::Conv::(GFLOPS=1.675, K=[3 x 3], IN={1, 128, 68, 88}, OCN=128, BIAS, OCV/CPU_FP16)|-|4.754|-| |conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 256, 38, 38}, OCN=256, G=256, P=[1 x 1], BIAS, 
OCV/CPU)|0.260|0.229|1.13| |conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 256, 38, 38}, OCN=256, G=256, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|0.229|-| |conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 256, 38, 38}, OCN=256, PM=SAME, OCV/CPU)|5.872|5.460|1.08| |conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 256, 38, 38}, OCN=256, PM=SAME, OCV/CPU_FP16)|-|5.460|-| |conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 512, 19, 19}, OCN=512, G=512, P=[1 x 1], BIAS, OCV/CPU)|0.161|0.161|1.00| |conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 512, 19, 19}, OCN=512, G=512, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|0.161|-| |conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 512, 19, 19}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|7.176|7.175|1.00| |conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 512, 19, 19}, OCN=512, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|7.162|-| |conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 512, 19, 19}, OCN=512, PM=SAME, OCV/CPU)|7.174|7.185|1.00| |conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 512, 19, 19}, OCN=512, PM=SAME, OCV/CPU_FP16)|-|7.157|-| |conv::Conv::(GFLOPS=1.766, K=[3 x 3], IN={1, 128, 70, 90}, OCN=128, BIAS, OCV/CPU)|5.400|5.180|1.04| |conv::Conv::(GFLOPS=1.766, K=[3 x 3], IN={1, 128, 70, 90}, OCN=128, BIAS, OCV/CPU_FP16)|-|5.201|-| |conv::Conv::(GFLOPS=1.859, K=[3 x 3], IN={1, 128, 72, 92}, OCN=128, BIAS, OCV/CPU)|5.330|5.188|1.03| |conv::Conv::(GFLOPS=1.859, K=[3 x 3], IN={1, 128, 72, 92}, OCN=128, BIAS, OCV/CPU_FP16)|-|5.177|-| |conv::Conv::(GFLOPS=1.888, K=[3 x 3], IN={1, 1024, 10, 10}, OCN=1024, G=1024, P=[1 x 1], BIAS, OCV/CPU)|0.115|0.115|1.00| |conv::Conv::(GFLOPS=1.888, K=[3 x 3], IN={1, 1024, 10, 10}, OCN=1024, G=1024, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|0.115|-| |conv::Conv::(GFLOPS=1.888, K=[3 x 3], IN={1, 1024, 10, 10}, OCN=1024, PM=SAME, OCV/CPU)|26.156|20.222|1.29| |conv::Conv::(GFLOPS=1.888, K=[3 x 3], IN={1, 1024, 10, 10}, OCN=1024, PM=SAME, OCV/CPU_FP16)|-|11.203|-| |conv::Conv::(GFLOPS=1.954, K=[3 x 3], IN={1, 128, 74, 94}, OCN=128, BIAS, OCV/CPU)|5.627|5.543|1.02| 
|conv::Conv::(GFLOPS=1.954, K=[3 x 3], IN={1, 128, 74, 94}, OCN=128, BIAS, OCV/CPU_FP16)|-|5.506|-| |conv::Conv::(GFLOPS=1.995, K=[9 x 9], IN={1, 3, 320, 400}, OCN=32, P=[4 x 4], BIAS, OCV/CPU)|27.925|27.741|1.01| |conv::Conv::(GFLOPS=1.995, K=[9 x 9], IN={1, 3, 320, 400}, OCN=32, P=[4 x 4], BIAS, OCV/CPU_FP16)|-|17.217|-| |conv::Conv::(GFLOPS=2.052, K=[3 x 3], IN={1, 128, 76, 96}, OCN=128, BIAS, OCV/CPU)|6.359|6.062|1.05| |conv::Conv::(GFLOPS=2.052, K=[3 x 3], IN={1, 128, 76, 96}, OCN=128, BIAS, OCV/CPU_FP16)|-|6.048|-| |conv::Conv::(GFLOPS=2.100, K=[3 x 3], IN={1, 144, 75, 75}, OCN=144, PM=SAME, OCV/CPU)|6.559|6.322|1.04| |conv::Conv::(GFLOPS=2.100, K=[3 x 3], IN={1, 144, 75, 75}, OCN=144, PM=SAME, OCV/CPU_FP16)|-|6.280|-| |conv::Conv::(GFLOPS=2.153, K=[3 x 3], IN={1, 128, 78, 98}, OCN=128, BIAS, OCV/CPU)|6.412|6.200|1.03| |conv::Conv::(GFLOPS=2.153, K=[3 x 3], IN={1, 128, 78, 98}, OCN=128, BIAS, OCV/CPU_FP16)|-|6.197|-| |conv::Conv::(GFLOPS=2.156, K=[3 x 3], IN={1, 576, 19, 19}, OCN=576, PM=SAME, OCV/CPU)|9.167|8.624|1.06| |conv::Conv::(GFLOPS=2.156, K=[3 x 3], IN={1, 576, 19, 19}, OCN=576, PM=SAME, OCV/CPU_FP16)|-|8.626|-| |conv::Conv::(GFLOPS=2.255, K=[3 x 3], IN={1, 128, 80, 100}, OCN=128, BIAS, OCV/CPU)|6.755|6.491|1.04| |conv::Conv::(GFLOPS=2.255, K=[3 x 3], IN={1, 128, 80, 100}, OCN=128, BIAS, OCV/CPU_FP16)|-|6.520|-| |conv::Conv::(GFLOPS=2.719, K=[3 x 3], IN={1, 96, 256, 256}, OCN=96, S=[2 x 2], PM=SAME, OCV/CPU)|35.664|34.752|1.03| |conv::Conv::(GFLOPS=2.719, K=[3 x 3], IN={1, 96, 256, 256}, OCN=96, S=[2 x 2], PM=SAME, OCV/CPU_FP16)|-|20.260|-| |conv::Conv::(GFLOPS=3.319, K=[3 x 3], IN={1, 128, 75, 75}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|9.514|9.414|1.01| |conv::Conv::(GFLOPS=3.319, K=[3 x 3], IN={1, 128, 75, 75}, OCN=256, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|9.462|-| |conv::Conv::(GFLOPS=3.321, K=[3 x 3], IN={1, 64, 150, 150}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|10.631|9.963|1.07| |conv::Conv::(GFLOPS=3.321, K=[3 x 3], IN={1, 64, 150, 150}, OCN=128, P=[1 x 
1], BIAS, OCV/CPU_FP16)|-|9.935|-| |conv::Conv::(GFLOPS=3.398, K=[7 x 7], IN={1, 128, 46, 46}, OCN=128, P=[3 x 3], BIAS, OCV/CPU)|37.465|36.798|1.02| |conv::Conv::(GFLOPS=3.398, K=[7 x 7], IN={1, 128, 46, 46}, OCN=128, P=[3 x 3], BIAS, OCV/CPU_FP16)|-|19.569|-| |conv::Conv::(GFLOPS=3.407, K=[3 x 3], IN={1, 512, 19, 19}, OCN=1024, D=[6 x 6], P=[6 x 6], BIAS, OCV/CPU)|38.157|36.157|1.06| |conv::Conv::(GFLOPS=3.407, K=[3 x 3], IN={1, 512, 19, 19}, OCN=1024, D=[6 x 6], P=[6 x 6], BIAS, OCV/CPU_FP16)|-|18.902|-| |conv::Conv::(GFLOPS=3.408, K=[3 x 3], IN={1, 256, 38, 38}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|10.356|10.401|1.00| |conv::Conv::(GFLOPS=3.408, K=[3 x 3], IN={1, 256, 38, 38}, OCN=512, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|10.360|-| |conv::Conv::(GFLOPS=4.247, K=[3 x 3], IN={1, 480, 32, 32}, OCN=480, PM=SAME, OCV/CPU)|12.641|12.150|1.04| |conv::Conv::(GFLOPS=4.247, K=[3 x 3], IN={1, 480, 32, 32}, OCN=480, PM=SAME, OCV/CPU_FP16)|-|12.162|-| |conv::Conv::(GFLOPS=4.247, K=[5 x 5], IN={1, 144, 128, 128}, OCN=144, S=[2 x 2], PM=SAME, OCV/CPU)|50.545|50.505|1.00| |conv::Conv::(GFLOPS=4.247, K=[5 x 5], IN={1, 144, 128, 128}, OCN=144, S=[2 x 2], PM=SAME, OCV/CPU_FP16)|-|27.950|-| |conv::Conv::(GFLOPS=4.566, K=[7 x 7], IN={1, 172, 46, 46}, OCN=128, P=[3 x 3], BIAS, OCV/CPU)|54.233|49.603|1.09| |conv::Conv::(GFLOPS=4.566, K=[7 x 7], IN={1, 172, 46, 46}, OCN=128, P=[3 x 3], BIAS, OCV/CPU_FP16)|-|26.515|-| |conv::Conv::(GFLOPS=4.993, K=[3 x 3], IN={1, 256, 46, 46}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|13.779|12.968|1.06| |conv::Conv::(GFLOPS=4.993, K=[3 x 3], IN={1, 256, 46, 46}, OCN=512, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|12.984|-| |conv::Conv::(GFLOPS=4.993, K=[3 x 3], IN={1, 512, 46, 46}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|15.809|15.329|1.03| |conv::Conv::(GFLOPS=4.993, K=[3 x 3], IN={1, 512, 46, 46}, OCN=256, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|15.433|-| |conv::Conv::(GFLOPS=4.994, K=[3 x 3], IN={1, 128, 92, 92}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|14.563|14.527|1.00| 
|conv::Conv::(GFLOPS=4.994, K=[3 x 3], IN={1, 128, 92, 92}, OCN=256, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|14.480|-| |conv::Conv::(GFLOPS=4.997, K=[3 x 3], IN={1, 64, 184, 184}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|16.714|16.484|1.01| |conv::Conv::(GFLOPS=4.997, K=[3 x 3], IN={1, 64, 184, 184}, OCN=128, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|16.362|-| |conv::Conv::(GFLOPS=5.780, K=[5 x 5], IN={1, 672, 32, 32}, OCN=672, S=[2 x 2], PM=SAME, OCV/CPU)|77.832|65.729|1.18| |conv::Conv::(GFLOPS=5.780, K=[5 x 5], IN={1, 672, 32, 32}, OCN=672, S=[2 x 2], PM=SAME, OCV/CPU_FP16)|-|32.065|-| |conv::Conv::(GFLOPS=6.116, K=[3 x 3], IN={1, 1152, 16, 16}, OCN=1152, PM=SAME, OCV/CPU)|21.903|20.386|1.07| |conv::Conv::(GFLOPS=6.116, K=[3 x 3], IN={1, 1152, 16, 16}, OCN=1152, PM=SAME, OCV/CPU_FP16)|-|20.416|-| |conv::Conv::(GFLOPS=6.118, K=[3 x 3], IN={1, 144, 128, 128}, OCN=144, PM=SAME, OCV/CPU)|20.405|18.148|1.12| |conv::Conv::(GFLOPS=6.118, K=[3 x 3], IN={1, 144, 128, 128}, OCN=144, PM=SAME, OCV/CPU_FP16)|-|18.128|-| |conv::Conv::(GFLOPS=6.637, K=[3 x 3], IN={1, 256, 75, 75}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|20.334|18.521|1.10| |conv::Conv::(GFLOPS=6.637, K=[3 x 3], IN={1, 256, 75, 75}, OCN=256, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|18.495|-| |conv::Conv::(GFLOPS=6.638, K=[3 x 3], IN={1, 128, 150, 150}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|21.527|19.584|1.10| |conv::Conv::(GFLOPS=6.638, K=[3 x 3], IN={1, 128, 150, 150}, OCN=128, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|19.630|-| |conv::Conv::(GFLOPS=6.641, K=[3 x 3], IN={1, 64, 150, 200}, OCN=192, PM=SAME, BIAS, OCV/CPU)|22.715|20.057|1.13| |conv::Conv::(GFLOPS=6.641, K=[3 x 3], IN={1, 64, 150, 200}, OCN=192, PM=SAME, BIAS, OCV/CPU_FP16)|-|20.068|-| |conv::Conv::(GFLOPS=6.641, K=[3 x 3], IN={1, 64, 300, 300}, OCN=64, P=[1 x 1], BIAS, OCV/CPU)|26.228|24.992|1.05| |conv::Conv::(GFLOPS=6.641, K=[3 x 3], IN={1, 64, 300, 300}, OCN=64, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|24.957|-| |conv::Conv::(GFLOPS=6.814, K=[3 x 3], IN={1, 512, 38, 38}, OCN=512, P=[1 x 1], BIAS, 
OCV/CPU)|21.524|21.581|1.00| |conv::Conv::(GFLOPS=6.814, K=[3 x 3], IN={1, 512, 38, 38}, OCN=512, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|21.782|-| |conv::Conv::(GFLOPS=8.025, K=[3 x 3], IN={1, 1024, 19, 19}, OCN=1206, P=[1 x 1], BIAS, OCV/CPU)|34.094|31.964|1.07| |conv::Conv::(GFLOPS=8.025, K=[3 x 3], IN={1, 1024, 19, 19}, OCN=1206, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|31.925|-| |conv::Conv::(GFLOPS=9.986, K=[3 x 3], IN={1, 512, 46, 46}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|28.677|27.813|1.03| |conv::Conv::(GFLOPS=9.986, K=[3 x 3], IN={1, 512, 46, 46}, OCN=512, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|27.808|-| |conv::Conv::(GFLOPS=9.987, K=[3 x 3], IN={1, 256, 92, 92}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|31.274|27.892|1.12| |conv::Conv::(GFLOPS=9.987, K=[3 x 3], IN={1, 256, 92, 92}, OCN=256, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|27.910|-| |conv::Conv::(GFLOPS=9.989, K=[3 x 3], IN={1, 128, 184, 184}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|30.533|30.007|1.02| |conv::Conv::(GFLOPS=9.989, K=[3 x 3], IN={1, 128, 184, 184}, OCN=128, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|30.089|-| |conv::Conv::(GFLOPS=9.993, K=[3 x 3], IN={1, 64, 368, 368}, OCN=64, P=[1 x 1], BIAS, OCV/CPU)|39.837|38.312|1.04| |conv::Conv::(GFLOPS=9.993, K=[3 x 3], IN={1, 64, 368, 368}, OCN=64, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|38.477|-| |conv::Conv::(GFLOPS=10.087, K=[3 x 3], IN={1, 576, 38, 50}, OCN=512, PM=SAME, BIAS, OCV/CPU)|32.480|29.237|1.11| |conv::Conv::(GFLOPS=10.087, K=[3 x 3], IN={1, 576, 38, 50}, OCN=512, PM=SAME, BIAS, OCV/CPU_FP16)|-|29.452|-| |conv::Conv::(GFLOPS=10.701, K=[3 x 3], IN={1, 512, 38, 38}, OCN=804, P=[1 x 1], BIAS, OCV/CPU)|33.544|32.832|1.02| |conv::Conv::(GFLOPS=10.701, K=[3 x 3], IN={1, 512, 38, 38}, OCN=804, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|32.784|-| |conv::Conv::(GFLOPS=11.797, K=[5 x 5], IN={1, 240, 64, 64}, OCN=240, PM=SAME, OCV/CPU)|134.481|130.678|1.03| |conv::Conv::(GFLOPS=11.797, K=[5 x 5], IN={1, 240, 64, 64}, OCN=240, PM=SAME, OCV/CPU_FP16)|-|70.134|-| |conv::Conv::(GFLOPS=11.797, K=[5 x 5], IN={1, 
480, 32, 32}, OCN=480, PM=SAME, OCV/CPU)|127.930|126.530|1.01| |conv::Conv::(GFLOPS=11.797, K=[5 x 5], IN={1, 480, 32, 32}, OCN=480, PM=SAME, OCV/CPU_FP16)|-|65.261|-| |conv::Conv::(GFLOPS=16.987, K=[5 x 5], IN={1, 1152, 16, 16}, OCN=1152, PM=SAME, OCV/CPU)|201.346|187.007|1.08| |conv::Conv::(GFLOPS=16.987, K=[5 x 5], IN={1, 1152, 16, 16}, OCN=1152, PM=SAME, OCV/CPU_FP16)|-|91.525|-| |conv::Conv::(GFLOPS=23.122, K=[5 x 5], IN={1, 672, 32, 32}, OCN=672, PM=SAME, OCV/CPU)|252.038|245.587|1.03| |conv::Conv::(GFLOPS=23.122, K=[5 x 5], IN={1, 672, 32, 32}, OCN=672, PM=SAME, OCV/CPU_FP16)|-|125.477|-| ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - [ ] There is a reference to the original bug report and related work - [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [ ] The feature is well documented and sample code can be built with the project CMake
2023-05-17 14:38:33 +08:00
if (target == DNN_TARGET_CUDA_FP16 || target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
{
default_l1 = 7e-3;
default_lInf = 2e-2;
}
else
{
default_l1 = 1e-5;
default_lInf = 1e-4;
}
}
// Reports layers of `net` that are not served by the configured backend/target.
//
// For every layer, a "FALLBACK" line is printed when the layer either does not
// support `backend` or its preferable target differs from `target`. Layers
// whose profiled timing is exactly zero are ignored, because they might have
// been fused into another layer.
//
// @param net network to inspect; its per-layer timings are read via getPerfProfile().
// @return true if at least one non-fused layer fell back, false otherwise.
bool checkFallbacks(Net& net) const
{
    // Check if all the layers are supported with current backend and target.
    // Some layers might be fused so their timings equal to zero.
    std::vector<double> timings;
    net.getPerfProfile(timings);
    const std::vector<std::string> names = net.getLayerNames();
    // Timings are indexed in parallel with the layer names below, so sizes must match.
    CV_CheckEQ(names.size(), timings.size(), "DNN critical error");

    bool hasFallbacks = false;
    // size_t counter: names.size() is unsigned, an int index would mix signedness in the comparison.
    for (size_t i = 0; i < names.size(); ++i)
    {
        Ptr<dnn::Layer> l = net.getLayer(net.getLayerId(names[i]));
        const bool fused = timings[i] == 0.;
        if ((!l->supportBackend(backend) || l->preferableTarget != target) && !fused)
        {
            hasFallbacks = true;
            std::cout << "FALLBACK: Layer [" << l->type << "]:[" << l->name << "] is expected to have backend implementation" << endl;
        }
    }
    return hasFallbacks;
}
// Fills the fixture's static deny lists (sets of test-case names).
//
// Each list is assigned from a braced initializer whose contents come from the
// corresponding *.inl.hpp file included in place. The Halide, Vulkan and CUDA
// lists are only populated when the matching HAVE_* macro is defined.
// NOTE(review): presumably invoked by GoogleTest once before the first test of
// this fixture runs - confirm against the enclosing class.
static void SetUpTestCase()
{
    // Lists that apply regardless of the optional backends.
    parser_deny_list = {
#include "test_onnx_conformance_layer_parser_denylist.inl.hpp"
    };
    global_deny_list = {
#include "test_onnx_conformance_layer_filter_opencv_all_denylist.inl.hpp"
    };
    opencv_deny_list = {
#include "test_onnx_conformance_layer_filter_opencv_denylist.inl.hpp"
    };
    opencl_fp16_deny_list = {
#include "test_onnx_conformance_layer_filter_opencv_ocl_fp16_denylist.inl.hpp"
    };
    opencl_deny_list = {
#include "test_onnx_conformance_layer_filter_opencv_ocl_fp32_denylist.inl.hpp"
    };
    cpu_deny_list = {
#include "test_onnx_conformance_layer_filter_opencv_cpu_denylist.inl.hpp"
    };
#ifdef HAVE_HALIDE
    halide_deny_list = {
#include "test_onnx_conformance_layer_filter__halide_denylist.inl.hpp"
    };
#endif
#ifdef HAVE_VULKAN
    vulkan_deny_list = {
#include "test_onnx_conformance_layer_filter__vulkan_denylist.inl.hpp"
    };
#endif
#ifdef HAVE_CUDA
    // CUDA keeps separate lists for the default and the FP16 variants.
    cuda_deny_list = {
#include "test_onnx_conformance_layer_filter__cuda_denylist.inl.hpp"
    };
    cuda_fp16_deny_list = {
#include "test_onnx_conformance_layer_filter__cuda_fp16_denylist.inl.hpp"
    };
#endif
}
};
// Out-of-class definitions of the static deny-list members of Test_ONNX_conformance.
// They are default-constructed (empty) here and assigned in SetUpTestCase().
std::set<std::string> Test_ONNX_conformance::parser_deny_list;
std::set<std::string> Test_ONNX_conformance::global_deny_list;
std::set<std::string> Test_ONNX_conformance::opencv_deny_list;
std::set<std::string> Test_ONNX_conformance::opencl_fp16_deny_list;
std::set<std::string> Test_ONNX_conformance::opencl_deny_list;
std::set<std::string> Test_ONNX_conformance::cpu_deny_list;
// Backend-specific lists are only defined when the matching HAVE_* macro is set.
#ifdef HAVE_HALIDE
std::set<std::string> Test_ONNX_conformance::halide_deny_list;
#endif
#ifdef HAVE_VULKAN
std::set<std::string> Test_ONNX_conformance::vulkan_deny_list;
#endif
#ifdef HAVE_CUDA
// Definitions of the CUDA deny-list static members; only present when HAVE_CUDA is defined.
std::set<std::string> Test_ONNX_conformance::cuda_deny_list;
std::set<std::string> Test_ONNX_conformance::cuda_fp16_deny_list;
#endif
TEST_P(Test_ONNX_conformance, Layer_Test)
{
const std::string& name = test_case.name;
ASSERT_FALSE(name.empty());
bool checkLayersFallbacks = true;
bool checkAccuracy = true;
// SKIP when the test case is in the parser deny list.
if (parser_deny_list.find(name) != parser_deny_list.end())
{
applyTestTag(CV_TEST_TAG_DNN_SKIP_PARSER, CV_TEST_TAG_DNN_SKIP_ONNX_CONFORMANCE);
}
// SKIP when the test case is in the global deny list.
if (global_deny_list.find(name) != global_deny_list.end())
{
applyTestTag(CV_TEST_TAG_DNN_SKIP_GLOBAL, CV_TEST_TAG_DNN_SKIP_ONNX_CONFORMANCE);
}
if (backend == DNN_BACKEND_OPENCV)
{
if (opencv_deny_list.find(name) != opencv_deny_list.end())
{
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCV_BACKEND, CV_TEST_TAG_DNN_SKIP_ONNX_CONFORMANCE);
}
if ((target == DNN_TARGET_OPENCL_FP16) && (opencl_fp16_deny_list.find(name) != opencl_fp16_deny_list.end()))
{
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16, CV_TEST_TAG_DNN_SKIP_OPENCV_BACKEND, CV_TEST_TAG_DNN_SKIP_ONNX_CONFORMANCE);
}
if ((target == DNN_TARGET_OPENCL) && (opencl_deny_list.find(name) != opencl_deny_list.end()))
{
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL, CV_TEST_TAG_DNN_SKIP_OPENCV_BACKEND, CV_TEST_TAG_DNN_SKIP_ONNX_CONFORMANCE);
}
if ((target == DNN_TARGET_CPU) && (cpu_deny_list.find(name) != cpu_deny_list.end()))
{
applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU, CV_TEST_TAG_DNN_SKIP_OPENCV_BACKEND, CV_TEST_TAG_DNN_SKIP_ONNX_CONFORMANCE);
}
Merge pull request #25630 from fengyuentau:nary-multi-thread dnn: parallelize nary elementwise forward implementation & enable related conformance tests #25630 This PR introduces the following changes: - [x] Parallelize binary forward impl - [x] Parallelize ternary forward impl (Where) - [x] Parallelize nary (Operator that can take >=1 operands) - [x] Enable conformance tests if workable ## Performance ### i7-12700K, RAM 64GB, Ubuntu 22.04 ``` Geometric mean (ms) Name of Test opencv opencv opencv perf perf perf core.x64.0606 core.x64.0606 core.x64.0606 vs opencv perf core.x64.0606 (x-factor) NCHW_C_sum::Layer_NaryEltwise::OCV/CPU 16.116 11.161 1.44 NCHW_NCHW_add::Layer_NaryEltwise::OCV/CPU 17.469 11.446 1.53 NCHW_NCHW_div::Layer_NaryEltwise::OCV/CPU 17.531 11.469 1.53 NCHW_NCHW_equal::Layer_NaryEltwise::OCV/CPU 28.653 13.682 2.09 NCHW_NCHW_greater::Layer_NaryEltwise::OCV/CPU 21.899 13.422 1.63 NCHW_NCHW_less::Layer_NaryEltwise::OCV/CPU 21.738 13.185 1.65 NCHW_NCHW_max::Layer_NaryEltwise::OCV/CPU 16.172 11.473 1.41 NCHW_NCHW_mean::Layer_NaryEltwise::OCV/CPU 16.309 11.565 1.41 NCHW_NCHW_min::Layer_NaryEltwise::OCV/CPU 16.166 11.454 1.41 NCHW_NCHW_mul::Layer_NaryEltwise::OCV/CPU 16.157 11.443 1.41 NCHW_NCHW_pow::Layer_NaryEltwise::OCV/CPU 163.459 15.234 10.73 NCHW_NCHW_ref_div::Layer_NaryEltwise::OCV/CPU 10.880 10.868 1.00 NCHW_NCHW_ref_max::Layer_NaryEltwise::OCV/CPU 10.947 11.058 0.99 NCHW_NCHW_ref_min::Layer_NaryEltwise::OCV/CPU 10.948 10.910 1.00 NCHW_NCHW_ref_mul::Layer_NaryEltwise::OCV/CPU 10.874 10.871 1.00 NCHW_NCHW_ref_sum::Layer_NaryEltwise::OCV/CPU 10.971 10.920 1.00 NCHW_NCHW_sub::Layer_NaryEltwise::OCV/CPU 17.546 11.462 1.53 NCHW_NCHW_sum::Layer_NaryEltwise::OCV/CPU 16.175 11.475 1.41 NHWC_C::Layer_NaryEltwise::OCV/CPU 11.339 11.333 1.00 NHWC_H::Layer_NaryEltwise::OCV/CPU 16.154 11.102 1.46 ``` ### Apple M1, RAM 16GB, macOS 14.4.1 ``` Geometric mean (ms) Name of Test opencv opencv opencv perf perf perf core.m1.0606 core.m1.0606.patch core.m1.0606.patch vs 
opencv perf core.m1.0606 (x-factor) NCHW_C_sum::Layer_NaryEltwise::OCV/CPU 28.418 3.768 7.54 NCHW_NCHW_add::Layer_NaryEltwise::OCV/CPU 6.942 5.679 1.22 NCHW_NCHW_div::Layer_NaryEltwise::OCV/CPU 5.822 5.653 1.03 NCHW_NCHW_equal::Layer_NaryEltwise::OCV/CPU 5.751 5.628 1.02 NCHW_NCHW_greater::Layer_NaryEltwise::OCV/CPU 5.797 5.599 1.04 NCHW_NCHW_less::Layer_NaryEltwise::OCV/CPU 7.272 5.578 1.30 NCHW_NCHW_max::Layer_NaryEltwise::OCV/CPU 5.777 5.562 1.04 NCHW_NCHW_mean::Layer_NaryEltwise::OCV/CPU 5.819 5.559 1.05 NCHW_NCHW_min::Layer_NaryEltwise::OCV/CPU 5.830 5.574 1.05 NCHW_NCHW_mul::Layer_NaryEltwise::OCV/CPU 5.759 5.567 1.03 NCHW_NCHW_pow::Layer_NaryEltwise::OCV/CPU 342.260 74.655 4.58 NCHW_NCHW_ref_div::Layer_NaryEltwise::OCV/CPU 8.338 8.280 1.01 NCHW_NCHW_ref_max::Layer_NaryEltwise::OCV/CPU 8.359 8.309 1.01 NCHW_NCHW_ref_min::Layer_NaryEltwise::OCV/CPU 8.412 8.295 1.01 NCHW_NCHW_ref_mul::Layer_NaryEltwise::OCV/CPU 8.380 8.297 1.01 NCHW_NCHW_ref_sum::Layer_NaryEltwise::OCV/CPU 8.356 8.323 1.00 NCHW_NCHW_sub::Layer_NaryEltwise::OCV/CPU 6.818 5.561 1.23 NCHW_NCHW_sum::Layer_NaryEltwise::OCV/CPU 5.805 5.570 1.04 NHWC_C::Layer_NaryEltwise::OCV/CPU 3.834 4.817 0.80 NHWC_H::Layer_NaryEltwise::OCV/CPU 28.402 3.771 7.53 ``` ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - [ ] There is a reference to the original bug report and related work - [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [ ] The feature is well documented and sample code can be built with the project CMake
2024-07-03 15:09:05 +08:00
if (name == "test_pow") {
default_lInf = 0.00013; // Expected: (normInf) <= (lInf), actual: 0.00012207 vs 0.0001
}
if (name == "test_gelu_tanh_1") {
default_l1 = 0.00011; // Expected: (normL1) <= (l1), actual: 0.000101805 vs 1e-05
default_lInf = 0.00016; // Expected: (normInf) <= (lInf), actual: 0.000152707 vs 0.0001
}
if (name == "test_gelu_tanh_2") {
if (target == DNN_TARGET_OPENCL_FP16) {
default_l1 = 0.00016; // Expected: (normL1) <= (l1), actual: 0.000157223 vs 9e-05
default_lInf = 0.0016; // Expected: (normInf) <= (lInf), actual: 0.00153041 vs 0.0005
} else {
default_l1 = 9e-5; // Expected: (normL1) <= (l1), actual: 8.80073e-05 vs 1e-05
default_lInf = 0.0005; // Expected: (normInf) <= (lInf), actual: 0.000455521 vs 0.0001
}
}
}
#ifdef HAVE_HALIDE
else if (backend == DNN_BACKEND_HALIDE)
{
if (halide_deny_list.find(name) != halide_deny_list.end())
{
applyTestTag(CV_TEST_TAG_DNN_SKIP_HALIDE, CV_TEST_TAG_DNN_SKIP_ONNX_CONFORMANCE);
}
}
#endif
#ifdef HAVE_INF_ENGINE
else if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
{
#include "test_onnx_conformance_layer_filter__openvino.inl.hpp"
}
#endif
#ifdef HAVE_VULKAN
else if (backend == DNN_BACKEND_VKCOM)
{
if (vulkan_deny_list.find(name) != vulkan_deny_list.end())
{
applyTestTag(CV_TEST_TAG_DNN_SKIP_VULKAN, CV_TEST_TAG_DNN_SKIP_ONNX_CONFORMANCE);
}
if (name == "test_gelu_tanh_1") {
default_l1 = 0.00011; // Expected: (normL1) <= (l1), actual: 0.000101805 vs 1e-05
default_lInf = 0.00016; // Expected: (normInf) <= (lInf), actual: 0.000152707 vs 0.0001
}
if (name == "test_gelu_tanh_2") {
default_l1 = 9e-5; // Expected: (normL1) <= (l1), actual: 8.80073e-05 vs 1e-05
default_lInf = 0.0005; // Expected: (normInf) <= (lInf), actual: 0.000455521 vs 0.0001
}
}
#endif
#ifdef HAVE_CUDA
else if (backend == DNN_BACKEND_CUDA)
{
Merge pull request #25630 from fengyuentau:nary-multi-thread dnn: parallelize nary elementwise forward implementation & enable related conformance tests #25630 This PR introduces the following changes: - [x] Parallelize binary forward impl - [x] Parallelize ternary forward impl (Where) - [x] Parallelize nary (Operator that can take >=1 operands) - [x] Enable conformance tests if workable ## Performance ### i7-12700K, RAM 64GB, Ubuntu 22.04 ``` Geometric mean (ms) Name of Test opencv opencv opencv perf perf perf core.x64.0606 core.x64.0606 core.x64.0606 vs opencv perf core.x64.0606 (x-factor) NCHW_C_sum::Layer_NaryEltwise::OCV/CPU 16.116 11.161 1.44 NCHW_NCHW_add::Layer_NaryEltwise::OCV/CPU 17.469 11.446 1.53 NCHW_NCHW_div::Layer_NaryEltwise::OCV/CPU 17.531 11.469 1.53 NCHW_NCHW_equal::Layer_NaryEltwise::OCV/CPU 28.653 13.682 2.09 NCHW_NCHW_greater::Layer_NaryEltwise::OCV/CPU 21.899 13.422 1.63 NCHW_NCHW_less::Layer_NaryEltwise::OCV/CPU 21.738 13.185 1.65 NCHW_NCHW_max::Layer_NaryEltwise::OCV/CPU 16.172 11.473 1.41 NCHW_NCHW_mean::Layer_NaryEltwise::OCV/CPU 16.309 11.565 1.41 NCHW_NCHW_min::Layer_NaryEltwise::OCV/CPU 16.166 11.454 1.41 NCHW_NCHW_mul::Layer_NaryEltwise::OCV/CPU 16.157 11.443 1.41 NCHW_NCHW_pow::Layer_NaryEltwise::OCV/CPU 163.459 15.234 10.73 NCHW_NCHW_ref_div::Layer_NaryEltwise::OCV/CPU 10.880 10.868 1.00 NCHW_NCHW_ref_max::Layer_NaryEltwise::OCV/CPU 10.947 11.058 0.99 NCHW_NCHW_ref_min::Layer_NaryEltwise::OCV/CPU 10.948 10.910 1.00 NCHW_NCHW_ref_mul::Layer_NaryEltwise::OCV/CPU 10.874 10.871 1.00 NCHW_NCHW_ref_sum::Layer_NaryEltwise::OCV/CPU 10.971 10.920 1.00 NCHW_NCHW_sub::Layer_NaryEltwise::OCV/CPU 17.546 11.462 1.53 NCHW_NCHW_sum::Layer_NaryEltwise::OCV/CPU 16.175 11.475 1.41 NHWC_C::Layer_NaryEltwise::OCV/CPU 11.339 11.333 1.00 NHWC_H::Layer_NaryEltwise::OCV/CPU 16.154 11.102 1.46 ``` ### Apple M1, RAM 16GB, macOS 14.4.1 ``` Geometric mean (ms) Name of Test opencv opencv opencv perf perf perf core.m1.0606 core.m1.0606.patch core.m1.0606.patch vs 
opencv perf core.m1.0606 (x-factor) NCHW_C_sum::Layer_NaryEltwise::OCV/CPU 28.418 3.768 7.54 NCHW_NCHW_add::Layer_NaryEltwise::OCV/CPU 6.942 5.679 1.22 NCHW_NCHW_div::Layer_NaryEltwise::OCV/CPU 5.822 5.653 1.03 NCHW_NCHW_equal::Layer_NaryEltwise::OCV/CPU 5.751 5.628 1.02 NCHW_NCHW_greater::Layer_NaryEltwise::OCV/CPU 5.797 5.599 1.04 NCHW_NCHW_less::Layer_NaryEltwise::OCV/CPU 7.272 5.578 1.30 NCHW_NCHW_max::Layer_NaryEltwise::OCV/CPU 5.777 5.562 1.04 NCHW_NCHW_mean::Layer_NaryEltwise::OCV/CPU 5.819 5.559 1.05 NCHW_NCHW_min::Layer_NaryEltwise::OCV/CPU 5.830 5.574 1.05 NCHW_NCHW_mul::Layer_NaryEltwise::OCV/CPU 5.759 5.567 1.03 NCHW_NCHW_pow::Layer_NaryEltwise::OCV/CPU 342.260 74.655 4.58 NCHW_NCHW_ref_div::Layer_NaryEltwise::OCV/CPU 8.338 8.280 1.01 NCHW_NCHW_ref_max::Layer_NaryEltwise::OCV/CPU 8.359 8.309 1.01 NCHW_NCHW_ref_min::Layer_NaryEltwise::OCV/CPU 8.412 8.295 1.01 NCHW_NCHW_ref_mul::Layer_NaryEltwise::OCV/CPU 8.380 8.297 1.01 NCHW_NCHW_ref_sum::Layer_NaryEltwise::OCV/CPU 8.356 8.323 1.00 NCHW_NCHW_sub::Layer_NaryEltwise::OCV/CPU 6.818 5.561 1.23 NCHW_NCHW_sum::Layer_NaryEltwise::OCV/CPU 5.805 5.570 1.04 NHWC_C::Layer_NaryEltwise::OCV/CPU 3.834 4.817 0.80 NHWC_H::Layer_NaryEltwise::OCV/CPU 28.402 3.771 7.53 ``` ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - [ ] There is a reference to the original bug report and related work - [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [ ] The feature is well documented and sample code can be built with the project CMake
2024-07-03 15:09:05 +08:00
if (target == DNN_TARGET_CUDA && cuda_deny_list.find(name) != cuda_deny_list.end())
{
applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA, CV_TEST_TAG_DNN_SKIP_ONNX_CONFORMANCE);
}
Merge pull request #25630 from fengyuentau:nary-multi-thread dnn: parallelize nary elementwise forward implementation & enable related conformance tests #25630 This PR introduces the following changes: - [x] Parallelize binary forward impl - [x] Parallelize ternary forward impl (Where) - [x] Parallelize nary (Operator that can take >=1 operands) - [x] Enable conformance tests if workable ## Performance ### i7-12700K, RAM 64GB, Ubuntu 22.04 ``` Geometric mean (ms) Name of Test opencv opencv opencv perf perf perf core.x64.0606 core.x64.0606 core.x64.0606 vs opencv perf core.x64.0606 (x-factor) NCHW_C_sum::Layer_NaryEltwise::OCV/CPU 16.116 11.161 1.44 NCHW_NCHW_add::Layer_NaryEltwise::OCV/CPU 17.469 11.446 1.53 NCHW_NCHW_div::Layer_NaryEltwise::OCV/CPU 17.531 11.469 1.53 NCHW_NCHW_equal::Layer_NaryEltwise::OCV/CPU 28.653 13.682 2.09 NCHW_NCHW_greater::Layer_NaryEltwise::OCV/CPU 21.899 13.422 1.63 NCHW_NCHW_less::Layer_NaryEltwise::OCV/CPU 21.738 13.185 1.65 NCHW_NCHW_max::Layer_NaryEltwise::OCV/CPU 16.172 11.473 1.41 NCHW_NCHW_mean::Layer_NaryEltwise::OCV/CPU 16.309 11.565 1.41 NCHW_NCHW_min::Layer_NaryEltwise::OCV/CPU 16.166 11.454 1.41 NCHW_NCHW_mul::Layer_NaryEltwise::OCV/CPU 16.157 11.443 1.41 NCHW_NCHW_pow::Layer_NaryEltwise::OCV/CPU 163.459 15.234 10.73 NCHW_NCHW_ref_div::Layer_NaryEltwise::OCV/CPU 10.880 10.868 1.00 NCHW_NCHW_ref_max::Layer_NaryEltwise::OCV/CPU 10.947 11.058 0.99 NCHW_NCHW_ref_min::Layer_NaryEltwise::OCV/CPU 10.948 10.910 1.00 NCHW_NCHW_ref_mul::Layer_NaryEltwise::OCV/CPU 10.874 10.871 1.00 NCHW_NCHW_ref_sum::Layer_NaryEltwise::OCV/CPU 10.971 10.920 1.00 NCHW_NCHW_sub::Layer_NaryEltwise::OCV/CPU 17.546 11.462 1.53 NCHW_NCHW_sum::Layer_NaryEltwise::OCV/CPU 16.175 11.475 1.41 NHWC_C::Layer_NaryEltwise::OCV/CPU 11.339 11.333 1.00 NHWC_H::Layer_NaryEltwise::OCV/CPU 16.154 11.102 1.46 ``` ### Apple M1, RAM 16GB, macOS 14.4.1 ``` Geometric mean (ms) Name of Test opencv opencv opencv perf perf perf core.m1.0606 core.m1.0606.patch core.m1.0606.patch vs 
opencv perf core.m1.0606 (x-factor) NCHW_C_sum::Layer_NaryEltwise::OCV/CPU 28.418 3.768 7.54 NCHW_NCHW_add::Layer_NaryEltwise::OCV/CPU 6.942 5.679 1.22 NCHW_NCHW_div::Layer_NaryEltwise::OCV/CPU 5.822 5.653 1.03 NCHW_NCHW_equal::Layer_NaryEltwise::OCV/CPU 5.751 5.628 1.02 NCHW_NCHW_greater::Layer_NaryEltwise::OCV/CPU 5.797 5.599 1.04 NCHW_NCHW_less::Layer_NaryEltwise::OCV/CPU 7.272 5.578 1.30 NCHW_NCHW_max::Layer_NaryEltwise::OCV/CPU 5.777 5.562 1.04 NCHW_NCHW_mean::Layer_NaryEltwise::OCV/CPU 5.819 5.559 1.05 NCHW_NCHW_min::Layer_NaryEltwise::OCV/CPU 5.830 5.574 1.05 NCHW_NCHW_mul::Layer_NaryEltwise::OCV/CPU 5.759 5.567 1.03 NCHW_NCHW_pow::Layer_NaryEltwise::OCV/CPU 342.260 74.655 4.58 NCHW_NCHW_ref_div::Layer_NaryEltwise::OCV/CPU 8.338 8.280 1.01 NCHW_NCHW_ref_max::Layer_NaryEltwise::OCV/CPU 8.359 8.309 1.01 NCHW_NCHW_ref_min::Layer_NaryEltwise::OCV/CPU 8.412 8.295 1.01 NCHW_NCHW_ref_mul::Layer_NaryEltwise::OCV/CPU 8.380 8.297 1.01 NCHW_NCHW_ref_sum::Layer_NaryEltwise::OCV/CPU 8.356 8.323 1.00 NCHW_NCHW_sub::Layer_NaryEltwise::OCV/CPU 6.818 5.561 1.23 NCHW_NCHW_sum::Layer_NaryEltwise::OCV/CPU 5.805 5.570 1.04 NHWC_C::Layer_NaryEltwise::OCV/CPU 3.834 4.817 0.80 NHWC_H::Layer_NaryEltwise::OCV/CPU 28.402 3.771 7.53 ``` ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - [ ] There is a reference to the original bug report and related work - [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [ ] The feature is well documented and sample code can be built with the project CMake
2024-07-03 15:09:05 +08:00
if (target == DNN_TARGET_CUDA_FP16 && cuda_fp16_deny_list.find(name) != cuda_fp16_deny_list.end())
{
applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA_FP16, CV_TEST_TAG_DNN_SKIP_ONNX_CONFORMANCE);
}
if (name == "test_gelu_tanh_1") {
default_l1 = 0.00011; // Expected: (normL1) <= (l1), actual: 0.000101815 vs 1e-05
default_lInf = 0.00016; // Expected: (normInf) <= (lInf), actual: 0.000152737 vs 0.0001
}
if (name == "test_gelu_tanh_2") {
if (target == DNN_TARGET_CUDA_FP16) {
default_l1 = 0.00023; // Expected: (normL1) <= (l1), actual: 0.000220591 vs 9e-05
default_lInf = 0.0023; // Expected: (normInf) <= (lInf), actual: 0.00220466 vs 0.0005
} else {
default_l1 = 9e-5; // Expected: (normL1) <= (l1), actual: 8.80127e-05 vs 1e-05
default_lInf = 0.0005; // Expected: (normInf) <= (lInf), actual: 0.000455445 vs 0.0001
}
}
}
#endif
else
{
std::ostringstream ss;
ss << "No test filter available for backend ";
PrintTo(backend, &ss);
ss << ". Run test by default";
std::cout << ss.str() << std::endl;
}
std::vector<Mat> inputs;
std::vector<Mat> ref_outputs;
std::string prefix = cv::format("dnn/onnx/conformance/node/%s", test_case.name);
Net net;
try
{
std::string model_path = findDataFile(prefix + "/model.onnx");
//cout << "Read ONNX inputs..." << endl;
for (int i = 0; i < test_case.inputs; ++i)
{
Mat input = readTensorFromONNX(findDataFile(prefix + cv::format("/test_data_set_0/input_%d.pb", i)));
inputs.push_back(input);
}
//cout << "Read ONNX reference outputs..." << endl;
for (int i = 0; i < test_case.outputs; ++i)
{
Mat output = readTensorFromONNX(findDataFile(prefix + cv::format("/test_data_set_0/output_%d.pb", i)));
ref_outputs.push_back(output);
}
//cout << "Parse model..." << endl;
net = readNetFromONNX(model_path);
if (net.empty())
{
applyTestTag(CV_TEST_TAG_DNN_ERROR_PARSER);
}
}
catch (...)
{
cout << "Exception during ONNX model parse / loading input / loading reference data!" << endl;
applyTestTag(CV_TEST_TAG_DNN_ERROR_PARSER);
throw;
}
ASSERT_FALSE(net.empty());
std::vector<std::string> inputNames;
for (int i = 0; i < inputs.size(); ++i)
inputNames.push_back(cv::format("%d", i));
net.setInputsNames(inputNames);
try
{
net.setPreferableBackend(backend);
net.setPreferableTarget(target);
for (int i = 0; i < inputs.size(); ++i)
{
net.setInput(inputs[i], inputNames[i]);
}
}
catch (...)
{
cout << "Exception during network configuration!" << endl;
applyTestTag(CV_TEST_TAG_DNN_ERROR_NET_SETUP);
throw;
}
std::vector<std::string> layerNames = net.getUnconnectedOutLayersNames();
std::vector<Mat> outputs;
try
{
net.forward(outputs, layerNames);
}
catch (...)
{
cout << "Exception during net.forward() call!" << endl;
applyTestTag(CV_TEST_TAG_DNN_ERROR_FORWARD);
throw;
}
ASSERT_GE(outputs.size(), 1);
if (checkLayersFallbacks && checkFallbacks(net))
{
applyTestTag(CV_TEST_TAG_DNN_LAYER_FALLBACK);
}
if (checkAccuracy)
{
try
{
if (ref_outputs.size() == 1)
{
// probably we found random unconnected layers.
normAssert(ref_outputs[0], outputs[0], "", default_l1, default_lInf);
}
else
{
ASSERT_EQ(outputs.size(), ref_outputs.size());
for (size_t i = 0; i < ref_outputs.size(); ++i)
{
normAssert(ref_outputs[i], outputs[i], "", default_l1, default_lInf);
}
}
}
catch (...)
{
cout << "Exception during accuracy check!" << endl;
throw;
}
}
else
{
applyTestTag(CV_TEST_TAG_DNN_NO_ACCURACY_CHECK);
}
if (!HasFailure())
cout << "Test passed!" << endl;
}
// Instantiate the value-parameterized Test_ONNX_conformance suite.
//
// The parameter set is built with testing::Combine from two generators:
//   1. every entry of the testConformanceConfig table (test name plus the
//      number of input and output tensors), and
//   2. the values produced by dnnBackendsAndTargets(), called here with both
//      the Inference Engine and Halide flags set to true.
//
// The instantiation prefix is intentionally empty (the /**/ placeholder).
// printOnnxConfParams is passed as the trailing macro argument.
// NOTE(review): it is presumably the name generator that turns each parameter
// tuple into the test-name suffix; its definition is outside this chunk, confirm.
INSTANTIATE_TEST_CASE_P(/**/, Test_ONNX_conformance,
testing::Combine(
testing::ValuesIn(testConformanceConfig),
dnnBackendsAndTargets(/*withInferenceEngine=*/true, /*withHalide=*/true)
),
printOnnxConfParams
);
}