opencv/modules/dnn/perf/perf_gemm.cpp


// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "perf_precomp.hpp"
#include <opencv2/dnn/shape_utils.hpp>
namespace opencv_test {
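// Describes a single GEMM benchmark case: the shapes of A, B and the
// optional bias C, plus the transpose flags applied to A and B.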
struct GemmParam_t {
std::vector<int> a_shape;
std::vector<int> b_shape;
std::vector<int> c_shape;
bool trans_a;
bool trans_b;
GemmParam_t(std::vector<int> a_shape_, std::vector<int> b_shape_, std::vector<int> c_shape_ = {}, bool trans_a_ = false, bool trans_b_ = false)
: a_shape(a_shape_), b_shape(b_shape_), c_shape(c_shape_), trans_a(trans_a_), trans_b(trans_b_) {}
};
// TODO: Disable most of the test cases except vision transformers to save time
static const GemmParam_t test_gemm_configs[] = {
// vision transformers cases
{ { 768, 768 }, { 768, 768 }, { 768 } },
{ { 1024, 1024 }, { 1024, 1024 }, { 1024 } },
{ { 50, 768 }, { 768, 2304 } },
{ { 197, 768 }, { 768, 2304 } },
{ { 50, 1024 }, { 1024, 3072 } },
{ { 197, 1024 }, { 1024, 3072 } },
// these cases are commented to save testing time
/*
// square mat
{ { 64, 64 }, { 64, 64 } },
{ { 128, 128 }, { 128, 128 } },
{ { 256, 256 }, { 256, 256 } },
{ { 512, 512 }, { 512, 512 } },
{ { 1024, 1024 }, { 1024, 1024 } },
{ { 4096, 4096 }, { 4096, 4096 } },
// rectangular mat
{ { 256, 256 }, { 256, 1024 } },
{ { 256, 1024 }, { 1024, 256 } },
{ { 256, 1024 }, { 1024, 1024 } },
{ { 1024, 1024 }, { 1024, 256 } },
{ { 1024, 256 }, { 256, 1024 } },
{ { 1024, 256 }, { 256, 256 } },
// with C
{ { 256, 256 }, { 256, 256 }, { 256 } },
{ { 256, 256 }, { 256, 1024 }, { 1024 } },
{ { 256, 1024 }, { 1024, 256 }, { 256 } },
{ { 256, 1024 }, { 1024, 1024 }, { 1024 } },
{ { 1024, 1024 }, { 1024, 256 }, { 256 } },
{ { 1024, 256 }, { 256, 1024 }, { 1024 } },
{ { 1024, 256 }, { 256, 256 }, { 256 } },
// with C and trans_b
{ { 256, 256 }, { 256, 256 }, { 256 } , false, true},
{ { 256, 1024 }, { 256, 1024 }, { 256 } , false, true},
{ { 256, 1024 }, { 1024, 1024 }, { 1024 } , false, true},
{ { 1024, 1024 }, { 1024, 1024 }, { 1024 } , false, true},
{ { 1024, 256 }, { 1024, 256 }, { 1024 } , false, true},
{ { 1024, 256 }, { 256, 256 }, { 256 } , false, true},
// with C and trans_b and trans_a
{ { 256, 256 }, { 256, 256 }, { 256 } , true, true},
{ { 1024, 256 }, { 256, 1024 }, { 256 } , true, true},
{ { 256, 1024 }, { 1024, 256 }, { 1024 } , true, true},
{ { 1024, 1024 }, { 1024, 1024 }, { 1024 } , true, true},
*/
};
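// Wraps an index into test_gemm_configs so that googletest can enumerate
// every config as a test parameter.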
struct GemmParamId
{
enum {
GEMM_0 = 0,
GEMM_LAST = sizeof(test_gemm_configs) / sizeof(test_gemm_configs[0])
};
int val_;
GemmParamId(int val = 0) : val_(val) {}
operator int() const { return val_; }
static ::testing::internal::ParamGenerator<GemmParamId> all()
{
enum { NUM = (int)GEMM_LAST };
GemmParamId v_[NUM]; for (int i = 0; i < NUM; ++i) { v_[i] = GemmParamId(i); } // reduce generated code size
return ::testing::ValuesIn(v_, v_ + NUM);
}
};
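// Pretty-printer used by googletest to display a config's shapes and
// transpose flags in test output.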
static inline void PrintTo(const GemmParamId& v, std::ostream* os)
{
CV_Assert((int)v >= 0); CV_Assert((int)v < GemmParamId::GEMM_LAST);
const GemmParam_t& p = test_gemm_configs[(int)v];
auto print_shape = [os](const std::vector<int>& shape, const std::string& tag) {
if (shape.empty()) {
return;
}
*os << tag << "=[";
for (size_t i = 0; i < shape.size(); ++i) {
if (i == shape.size() - 1) {
*os << shape[i] << "]";
break;
}
*os << shape[i] << ", ";
}
};
print_shape(p.a_shape, "A");
print_shape(p.b_shape, ", B");
print_shape(p.c_shape, ", C");
*os << ", trans_a=" << p.trans_a << ", trans_b=" << p.trans_b;
}
typedef tuple<GemmParamId, tuple<Backend, Target> > GemmTestParam_t;
typedef TestBaseWithParam<GemmTestParam_t> Gemm;
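// Benchmarks the dedicated Gemm layer; B (and C, when present) are passed
// as constant blobs (constB/constC), which lets the implementation prepack them.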
PERF_TEST_P_(Gemm, gemm)
{
int test_id = (int)get<0>(GetParam());
ASSERT_GE(test_id, 0); ASSERT_LT(test_id, GemmParamId::GEMM_LAST);
const GemmParam_t& params = test_gemm_configs[test_id];
auto a_shape = params.a_shape;
auto b_shape = params.b_shape;
auto c_shape = params.c_shape;
auto trans_a = params.trans_a;
auto trans_b = params.trans_b;
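// ONNX Gemm semantics: Y = alpha * A' * B' + beta * C, where A' and B' are
// optionally transposed; alpha and beta are fixed to 1 here.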
float alpha = 1.f;
float beta = 1.f;
Backend backend_id = get<0>(get<1>(GetParam()));
Target target_id = get<1>(get<1>(GetParam()));
bool have_bias = !c_shape.empty();
Mat A(static_cast<int>(a_shape.size()), a_shape.data(), CV_32F);
randu(A, -1.0f, 1.0f);
Mat B(static_cast<int>(b_shape.size()), b_shape.data(), CV_32F);
randu(B, -1.0f, 1.0f);
LayerParams lp;
lp.type = "Gemm";
lp.name = "testLayer";
lp.set("transA", trans_a);
lp.set("transB", trans_b);
lp.set("alpha", alpha);
lp.set("beta", beta);
lp.set("real_ndims_C", static_cast<int>(c_shape.size()));
lp.set("constB", true);
lp.blobs.push_back(B);
if (have_bias) {
Mat C(static_cast<int>(c_shape.size()), c_shape.data(), CV_32F);
randu(C, -1.0f, 1.0f);
lp.set("have_bias", true);
lp.set("constC", true);
lp.blobs.push_back(C);
}
Net net;
net.addLayerToPrev(lp.name, lp.type, lp);
net.setPreferableBackend(backend_id);
net.setPreferableTarget(target_id);
// warmup
{
net.setInput(A);
Mat out = net.forward();
}
TEST_CYCLE()
{
Mat res = net.forward();
}
SANITY_CHECK_NOTHING();
}
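// Benchmarks the same configurations through the InnerProduct
// (fully connected) layer as a baseline for comparison.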
PERF_TEST_P_(Gemm, innerproduct)
{
int test_id = (int)get<0>(GetParam());
ASSERT_GE(test_id, 0); ASSERT_LT(test_id, GemmParamId::GEMM_LAST);
const GemmParam_t& params = test_gemm_configs[test_id];
auto a_shape = params.a_shape;
auto b_shape = params.b_shape;
auto c_shape = params.c_shape;
auto trans_a = params.trans_a;
auto trans_b = params.trans_b;
Backend backend_id = get<0>(get<1>(GetParam()));
Target target_id = get<1>(get<1>(GetParam()));
bool have_bias = !c_shape.empty();
Mat A(static_cast<int>(a_shape.size()), a_shape.data(), CV_32F);
randu(A, -1.0f, 1.0f);
Mat B(static_cast<int>(b_shape.size()), b_shape.data(), CV_32F);
randu(B, -1.0f, 1.0f);
LayerParams lp;
lp.type = "InnerProduct";
lp.name = "testLayer";
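// InnerProduct computes A * W^T with weights stored as [num_output, K],
// so A is materialized transposed when trans_a is set, and B is transposed
// into the weight layout when trans_b is not.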
if (trans_a) {
cv::transpose(A, A);
}
if (!trans_b) {
cv::transpose(B, B);
}
lp.blobs.push_back(B);
lp.set("num_output", B.size[0]);
if (have_bias) {
Mat C(static_cast<int>(c_shape.size()), c_shape.data(), CV_32F);
randu(C, -1.0f, 1.0f);
lp.blobs.push_back(C);
lp.set("bias_term", true);
} else {
lp.set("bias_term", false);
}
Net net;
net.addLayerToPrev(lp.name, lp.type, lp);
net.setPreferableBackend(backend_id);
net.setPreferableTarget(target_id);
// warmup
{
std::vector<std::string> input_names(1);
input_names[0] = "A";
net.setInputsNames(input_names);
net.setInput(A, input_names[0]);
Mat out = net.forward();
}
TEST_CYCLE()
{
Mat res = net.forward();
}
SANITY_CHECK_NOTHING();
}
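// Instantiate both benchmarks over every config and every available
// DNN backend/target pair.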
INSTANTIATE_TEST_CASE_P(/**/, Gemm, Combine(
GemmParamId::all(),
dnnBackendsAndTargets(false, false) // defined in ../test/test_common.hpp
));
} // namespace opencv_test