diff --git a/modules/dnn/include/opencv2/dnn/all_layers.hpp b/modules/dnn/include/opencv2/dnn/all_layers.hpp index 1d3dd3bc4a..3109955685 100644 --- a/modules/dnn/include/opencv2/dnn/all_layers.hpp +++ b/modules/dnn/include/opencv2/dnn/all_layers.hpp @@ -1160,6 +1160,11 @@ CV__DNN_INLINE_NS_BEGIN static Ptr create(const LayerParams& params); }; + class CV_EXPORTS MatMulLayer : public Layer { + public: + static Ptr create(const LayerParams ¶ms); + }; + class CV_EXPORTS ExpandLayer : public Layer { public: diff --git a/modules/dnn/perf/perf_gemm.cpp b/modules/dnn/perf/perf_gemm.cpp index 8051cc273e..40fd66865b 100644 --- a/modules/dnn/perf/perf_gemm.cpp +++ b/modules/dnn/perf/perf_gemm.cpp @@ -5,6 +5,8 @@ #include "perf_precomp.hpp" #include +#include + namespace opencv_test { struct GemmParam_t { @@ -71,6 +73,18 @@ static const GemmParam_t test_gemm_configs[] = { */ }; +static const GemmParam_t test_matmul_configs[] = { + // vision transformer cases + { {12, 197, 197}, {12, 197, 64} }, + { {12, 197, 64 }, {12, 64, 197} }, + { {12, 50, 64}, {12, 64, 50} }, + { {12, 50, 50}, {12, 50, 64} }, + { {16, 197, 197}, {16, 197, 64} }, + { {16, 197, 64 }, {16, 64, 197} }, + { {16, 50, 64}, {16, 64, 50} }, + { {16, 50, 50}, {16, 50, 64} }, +}; + struct GemmParamId { enum { @@ -88,6 +102,21 @@ struct GemmParamId } }; +struct MatMulParamId { + enum { + MATMUL_0 = 0, + MATMUL_LAST = sizeof(test_matmul_configs) / sizeof(test_matmul_configs[0]) + }; + int val_; + MatMulParamId(int val = 0) : val_(val) {} + operator int() const { return val_; } + static ::testing::internal::ParamGenerator all() { + enum { NUM = (int)MATMUL_LAST }; + MatMulParamId v_[NUM]; for (int i = 0; i < NUM; i++) { v_[i] = MatMulParamId(i); } + return ::testing::ValuesIn(v_, v_ + NUM); + } +}; + static inline void PrintTo(const GemmParamId& v, std::ostream* os) { CV_Assert((int)v >= 0); CV_Assert((int)v < GemmParamId::GEMM_LAST); @@ -138,7 +167,7 @@ PERF_TEST_P_(Gemm, gemm) Mat A(static_cast(a_shape.size()), a_shape.data(), CV_32F); randu(A, -1.0f, 1.0f); Mat B(static_cast(b_shape.size()), b_shape.data(), CV_32F); - randu(A, -1.0f, 1.0f); + randu(B, -1.0f, 1.0f); LayerParams lp; lp.type = "Gemm"; @@ -197,7 +226,7 @@ PERF_TEST_P_(Gemm, innerproduct) Mat A(static_cast(a_shape.size()), a_shape.data(), CV_32F); randu(A, -1.0f, 1.0f); Mat B(static_cast(b_shape.size()), b_shape.data(), CV_32F); - randu(A, -1.0f, 1.0f); + randu(B, -1.0f, 1.0f); LayerParams lp; lp.type = "InnerProduct"; @@ -241,9 +270,146 @@ PERF_TEST_P_(Gemm, innerproduct) SANITY_CHECK_NOTHING(); } +static inline void PrintTo(const MatMulParamId& v, std::ostream* os) +{ + CV_Assert((int)v >= 0); CV_Assert((int)v < MatMulParamId::MATMUL_LAST); + const GemmParam_t& p = test_matmul_configs[(int)v]; + + auto print_shape = [os](const std::vector& shape, const std::string tag) { + if (shape.empty()) { + return ; + } + + *os << tag << "=["; + for (size_t i = 0; i < shape.size(); ++i) { + if (i == shape.size() - 1) { + *os << shape[i] << "]"; + break; + } + *os << shape[i] << ", "; + } + }; + + print_shape(p.a_shape, "A"); + print_shape(p.b_shape, ", B"); + print_shape(p.c_shape, ", C"); + *os << ", trans_a=" << p.trans_a << ", trans_b=" << p.trans_b; +} + +using MatMulTestParam_t = tuple>; +using MatMul = TestBaseWithParam; + +PERF_TEST_P_(MatMul, matmul) +{ + int test_id = (int)get<0>(GetParam()); + ASSERT_GE(test_id, 0); ASSERT_LT(test_id, MatMulParamId::MATMUL_LAST); + const GemmParam_t& params = test_matmul_configs[test_id]; + auto a_shape = params.a_shape; + auto b_shape = 
params.b_shape; + auto trans_a = params.trans_a; + auto trans_b = params.trans_b; + float alpha = 1.f; + float beta = 1.f; + + Backend backend_id = get<0>(get<1>(GetParam())); + Target target_id = get<1>(get<1>(GetParam())); + + Mat A(a_shape, CV_32F); + randu(A, -1.0f, 1.0f); + Mat B(b_shape, CV_32F); + randu(B, -1.0f, 1.0f); + + LayerParams lp; + lp.type = "MatMul"; + lp.name = "testLayer"; + lp.set("transA", trans_a); + lp.set("transB", trans_b); + lp.set("alpha", alpha); + lp.set("beta", beta); + lp.blobs.push_back(B); + + Net net; + net.addLayerToPrev(lp.name, lp.type, lp); + net.setPreferableBackend(backend_id); + net.setPreferableTarget(target_id); + + // warmup + { + std::vector input_names{"A"}; + net.setInputsNames(input_names); + net.setInput(A, input_names[0]); + Mat out = net.forward(); + } + + TEST_CYCLE() + { + Mat res = net.forward(); + } + + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P_(MatMul, innerproduct) +{ + int test_id = (int)get<0>(GetParam()); + ASSERT_GE(test_id, 0); ASSERT_LT(test_id, MatMulParamId::MATMUL_LAST); + const GemmParam_t& params = test_matmul_configs[test_id]; + auto a_shape = params.a_shape; + auto b_shape = params.b_shape; + + Backend backend_id = get<0>(get<1>(GetParam())); + Target target_id = get<1>(get<1>(GetParam())); + + Mat A(a_shape, CV_32F); + randu(A, -1.0f, 1.0f); + Mat B(b_shape, CV_32F); + randu(B, -1.0f, 1.0f); + + LayerParams lp; + lp.type = "InnerProduct"; + lp.name = "testLayer"; + lp.set("axis", (int)(a_shape.size() - 1)); + lp.set("bias_term", false); + + // pre-transpose + std::vector order(b_shape.size()); + std::iota(order.begin(), order.end(), 0); + std::swap(order.back(), order[b_shape.size() - 2]); + Mat B_transposed; + transposeND(B, order, B_transposed); + lp.blobs.push_back(B_transposed); + lp.set("num_output", int(B_transposed.total(0, b_shape.size() - 1))); + lp.set("is_matmul", true); + + Net net; + net.addLayerToPrev(lp.name, lp.type, lp); + net.setPreferableBackend(backend_id); + net.setPreferableTarget(target_id); + + // warmup + { + std::vector input_names{"A"}; + net.setInputsNames(input_names); + net.setInput(A, input_names[0]); + Mat out = net.forward(); + } + + TEST_CYCLE() + { + Mat res = net.forward(); + } + + SANITY_CHECK_NOTHING(); +} + INSTANTIATE_TEST_CASE_P(/**/, Gemm, Combine( GemmParamId::all(), dnnBackendsAndTargets(false, false) // defined in ../test/test_common.hpp )); +INSTANTIATE_TEST_CASE_P(/**/, MatMul, Combine( + MatMulParamId::all(), + dnnBackendsAndTargets(false, false) // defined in ../test/test_common.hpp +)); + } // namespace diff --git a/modules/dnn/src/cuda4dnn/csl/cublas.hpp b/modules/dnn/src/cuda4dnn/csl/cublas.hpp index 760e3824fd..96cf70fab9 100644 --- a/modules/dnn/src/cuda4dnn/csl/cublas.hpp +++ b/modules/dnn/src/cuda4dnn/csl/cublas.hpp @@ -8,6 +8,7 @@ #include "error.hpp" #include "stream.hpp" #include "pointer.hpp" +#include "memory.hpp" #include @@ -363,6 +364,145 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cu ); } + /** @brief Strided batched GEMM for colummn-major matrices + * + * \f$ C_i = \alpha A_i B_i + \beta C_i \f$ for a stack of matrices A, B and C indexed by i + * + * @tparam T matrix element type (must be `half` or `float`) + * + * @param handle valid cuBLAS Handle + * @param trans_a use transposed matrix of A_i for computation + * @param trans_b use transposed matrix of B_i for computation + * @param M number of rows in C + * @param N number of columns in C + * @param K common dimension of A (or trans A) and B (or trans B) + * @param 
alpha scale factor for A B + * @param[in] A pointer to stack of column-major matrices A in device memory + * @param lda leading dimension of matrix A + * @param A_offsets offsets to get A slices + * @param[in] B pointer to stack of column-major matrices B in device memory + * @param ldb leading dimension of matrix B + * @param B_offsets offsets to get B slices + * @param beta scale factor for C + * @param[in,out] C pointer to stack of column-major matrices C in device memory + * @param ldc leading dimension of matrix C + * @param C_offsets offsets to get C slices + * @param batchCount number of matrices in the batch + * + * Exception Guarantee: Basic + */ + template + void gemmBatched(const Handle &handle, + bool trans_a, bool trans_b, + std::size_t M, std::size_t N, std::size_t K, + T alpha, + const DevicePtr A, std::size_t lda, std::vector A_offsets, + const DevicePtr B, std::size_t ldb, std::vector B_offsets, + T beta, + const DevicePtr C, std::size_t ldc, std::vector C_offsets, + std::size_t batchCount); + + template <> inline + void gemmBatched(const Handle &handle, + bool trans_a, bool trans_b, + std::size_t M, std::size_t N, std::size_t K, + half alpha, + const DevicePtr A, std::size_t lda, std::vector A_offsets, + const DevicePtr B, std::size_t ldb, std::vector B_offsets, + half beta, + const DevicePtr C, std::size_t ldc, std::vector C_offsets, + std::size_t batchCount) { + CV_Assert(handle); + + const auto opa = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N, + opb = trans_b ? CUBLAS_OP_T : CUBLAS_OP_N; + const auto iM = static_cast(M), + iN = static_cast(N), + iK = static_cast(K), + ilda = static_cast(lda), + ildb = static_cast(ldb), + ildc = static_cast(ldc); + + const auto batch_count = static_cast(batchCount); + + AutoBuffer buffer(3 * batch_count); + auto A_slices = (half**)(buffer.data()); + auto B_slices = A_slices + batch_count; + auto C_slices = B_slices + batch_count; + // collect A, B and C slices + for (int i = 0; i < batch_count; i++) { + A_slices[i] = (half*)(A.get()) + A_offsets[i]; + B_slices[i] = (half*)(B.get()) + B_offsets[i]; + C_slices[i] = (half*)(C.get()) + C_offsets[i]; + } + + const half **dev_A_slices = 0, **dev_B_slices = 0; + half **dev_C_slices = 0; + cudaMalloc((void**)&dev_A_slices, batch_count * sizeof(half*)); + cudaMalloc((void**)&dev_B_slices, batch_count * sizeof(half*)); + cudaMalloc((void**)&dev_C_slices, batch_count * sizeof(half*)); + cudaMemcpy(dev_A_slices, A_slices, batch_count * sizeof(half*), cudaMemcpyHostToDevice); + cudaMemcpy(dev_B_slices, B_slices, batch_count * sizeof(half*), cudaMemcpyHostToDevice); + cudaMemcpy(dev_C_slices, C_slices, batch_count * sizeof(half*), cudaMemcpyHostToDevice); + + CUDA4DNN_CHECK_CUBLAS(cublasHgemmBatched(handle.get(), opa, opb, iM, iN, iK, &alpha, dev_A_slices, ilda, dev_B_slices, ildb, &beta, dev_C_slices, ildc, batch_count)); + + cudaFree(dev_A_slices); + cudaFree(dev_B_slices); + cudaFree(dev_C_slices); + } + + template <> inline + void gemmBatched(const Handle &handle, + bool trans_a, bool trans_b, + std::size_t M, std::size_t N, std::size_t K, + float alpha, + const DevicePtr A, std::size_t lda, std::vector A_offsets, + const DevicePtr B, std::size_t ldb, std::vector B_offsets, + float beta, + const DevicePtr C, std::size_t ldc, std::vector C_offsets, + std::size_t batchCount) { + CV_Assert(handle); + + const auto opa = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N, + opb = trans_b ? 
CUBLAS_OP_T : CUBLAS_OP_N; + const auto iM = static_cast(M), + iN = static_cast(N), + iK = static_cast(K), + ilda = static_cast(lda), + ildb = static_cast(ldb), + ildc = static_cast(ldc); + + const auto batch_count = static_cast(batchCount); + + AutoBuffer buffer(3 * batch_count); + auto A_slices = (float**)(buffer.data()); + auto B_slices = A_slices + batch_count; + auto C_slices = B_slices + batch_count; + // collect A, B and C slices + for (int i = 0; i < batch_count; i++) { + A_slices[i] = (float*)(A.get()) + A_offsets[i]; + B_slices[i] = (float*)(B.get()) + B_offsets[i]; + C_slices[i] = (float*)(C.get()) + C_offsets[i]; + } + + const float **dev_A_slices = 0, **dev_B_slices = 0; + float **dev_C_slices = 0; + cudaMalloc((void**)&dev_A_slices, batch_count * sizeof(float*)); + cudaMalloc((void**)&dev_B_slices, batch_count * sizeof(float*)); + cudaMalloc((void**)&dev_C_slices, batch_count * sizeof(float*)); + cudaMemcpy(dev_A_slices, A_slices, batch_count * sizeof(float*), cudaMemcpyHostToDevice); + cudaMemcpy(dev_B_slices, B_slices, batch_count * sizeof(float*), cudaMemcpyHostToDevice); + cudaMemcpy(dev_C_slices, C_slices, batch_count * sizeof(float*), cudaMemcpyHostToDevice); + + // cuBLAS is column-major + CUDA4DNN_CHECK_CUBLAS(cublasSgemmBatched(handle.get(), opa, opb, iM, iN, iK, &alpha, dev_A_slices, ilda, dev_B_slices, ildb, &beta, dev_C_slices, ildc, batch_count)); + + cudaFree(dev_A_slices); + cudaFree(dev_B_slices); + cudaFree(dev_C_slices); + } + }}}}} /* namespace cv::dnn::cuda4dnn::csl::cublas */ #endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_CUBLAS_HPP */ diff --git a/modules/dnn/src/cuda4dnn/csl/tensor_ops.hpp b/modules/dnn/src/cuda4dnn/csl/tensor_ops.hpp index 27f8306bf3..868b0c9284 100644 --- a/modules/dnn/src/cuda4dnn/csl/tensor_ops.hpp +++ b/modules/dnn/src/cuda4dnn/csl/tensor_ops.hpp @@ -152,6 +152,31 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { batch_size); } + /** @brief performs generalized matrix-multiplication for a strided batch of matrices + * + * Pre-conditions: + * - A, B and C must be rank three tensors with dimensions (batch, rows, cols) + * - the last two axes of \p A and \p B must meet the mathematical requirements for matrix multiplication + * - \p C must be large enough to hold the result and the matrices must not overlap in memory + * + * Exception Guarantee: Basic + */ + template inline + void gemmBatched(const cublas::Handle& handle, std::size_t batch, + T beta, TensorSpan C, const std::vector C_offsets, T alpha, + bool trans_a, TensorView A, const std::vector A_offsets, + bool trans_b, TensorView B, const std::vector B_offsets) { + const auto M = C.get_axis_size(-2), + N = C.get_axis_size(-1), + K = A.get_axis_size(trans_a ? -2 : -1); + const auto lda = A.get_axis_size(-1), + ldb = B.get_axis_size(-1), + ldc = N; + + // collect pointers and run cublasSgemmBatched / cublasHgemmBatched + csl::cublas::gemmBatched(handle, trans_b, trans_a, N, M, K, 1.f, B.get(), ldb, B_offsets, A.get(), lda, A_offsets, 0.f, C.get(), ldc, C_offsets, batch); + } + /** @brief performs element-wise addition with broadcasting * * Pre-conditions: diff --git a/modules/dnn/src/cuda4dnn/primitives/matmul_broadcast.hpp b/modules/dnn/src/cuda4dnn/primitives/matmul_broadcast.hpp new file mode 100644 index 0000000000..824d917382 --- /dev/null +++ b/modules/dnn/src/cuda4dnn/primitives/matmul_broadcast.hpp @@ -0,0 +1,79 @@ +// This file is part of OpenCV project. 
+// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_MATMUL_BROADCAST_HPP +#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_MATMUL_BROADCAST_HPP + +#include "../../op_cuda.hpp" + +#include "../csl/stream.hpp" +#include "../csl/cublas.hpp" +#include "../csl/tensor.hpp" +#include "../csl/tensor_ops.hpp" + +#include + +#include + +namespace cv { namespace dnn { namespace cuda4dnn { + + template + class MatMulBroadcastOp final : public CUDABackendNode { + public: + using wrapper_type = GetCUDABackendWrapperType; + + MatMulBroadcastOp(csl::Stream stream_, csl::cublas::Handle handle, const Mat &B, bool _transA, bool _transB, + const std::vector &A_offsets_, const std::vector &B_offsets_, std::vector &C_offsets_, + size_t batch_) + : stream(std::move(stream_)), cublasHandle(std::move(handle)), A_offsets(A_offsets_), B_offsets(B_offsets_), C_offsets(C_offsets_), batch(batch_) + { + if (!B.empty()) { + input_B_tensor = csl::makeTensorHeader(B); + csl::copyMatToTensor(B, input_B_tensor, stream); + } + + transA = _transA; + transB = _transB; + } + + void forward( + const std::vector>& inputs, + const std::vector>& outputs, + csl::Workspace& workspace) override + { + CV_Assert(((inputs.size() == 2 && input_B_tensor.empty()) || + (inputs.size() == 1 && !input_B_tensor.empty())) && outputs.size() == 1); + + auto input_A_wrapper = inputs[0].dynamicCast(); + auto input_A = input_A_wrapper->getView(); + + csl::TensorView input_B; + if (input_B_tensor.empty()) { + auto input_B_wrapper = inputs[1].dynamicCast(); + input_B = input_B_wrapper->getView(); + } else { + input_B = csl::TensorView(input_B_tensor); + } + + auto output_wrapper = outputs[0].dynamicCast(); + auto output = output_wrapper->getSpan(); + + csl::tensor_ops::gemmBatched(cublasHandle, batch, 0.f, output, C_offsets, 1.f, transA, input_A, A_offsets, transB, input_B, B_offsets); + } + + private: + csl::Stream stream; + csl::cublas::Handle cublasHandle; + csl::Tensor input_B_tensor; + bool transA, transB; + + std::vector A_offsets; + std::vector B_offsets; + std::vector C_offsets; + size_t batch; + }; + +}}} /* namespace cv::dnn::cuda4dnn */ + +#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_MATMUL_BROADCAST_HPP */ diff --git a/modules/dnn/src/init.cpp b/modules/dnn/src/init.cpp index 961e6e5c9a..cc316efbfc 100644 --- a/modules/dnn/src/init.cpp +++ b/modules/dnn/src/init.cpp @@ -102,6 +102,7 @@ void initializeLayerFactory() CV_DNN_REGISTER_LAYER_CLASS(LRN, LRNLayer); CV_DNN_REGISTER_LAYER_CLASS(InnerProduct, InnerProductLayer); CV_DNN_REGISTER_LAYER_CLASS(Gemm, GemmLayer); + CV_DNN_REGISTER_LAYER_CLASS(MatMul, MatMulLayer); CV_DNN_REGISTER_LAYER_CLASS(Softmax, SoftmaxLayer); CV_DNN_REGISTER_LAYER_CLASS(SoftMax, SoftmaxLayer); // For compatibility. 
See https://github.com/opencv/opencv/issues/16877 CV_DNN_REGISTER_LAYER_CLASS(MVN, MVNLayer); diff --git a/modules/dnn/src/layers/cpu_kernels/fast_gemm.cpp b/modules/dnn/src/layers/cpu_kernels/fast_gemm.cpp index b7aa18d486..ef71d8b10c 100644 --- a/modules/dnn/src/layers/cpu_kernels/fast_gemm.cpp +++ b/modules/dnn/src/layers/cpu_kernels/fast_gemm.cpp @@ -21,48 +21,76 @@ namespace cv { namespace dnn { void fastGemmPackB(const Mat &B, std::vector &packed_B, bool trans, FastGemmOpt &opt) { - CV_CheckEQ(B.dims, 2, "fastGemmPackB: input mat should be two-dimensional"); CV_CheckTypeEQ(B.type(), CV_32F, "fastGemmPackB: only float32 is supported for now"); auto B_shape = shape(B); - int K = B_shape[0], N = B_shape[1], ldb0 = N, ldb1 = 1; + int batch = total(B_shape, 0, B_shape.size() - 2), + K = B_shape[B_shape.size() - 2], N = B_shape.back(), ldb0 = N, ldb1 = 1; if (trans) { std::swap(K, N); std::swap(ldb0, ldb1); } + const auto *b = B.ptr(); + int esz = B.elemSize(); + #if CV_TRY_NEON if (opt.use_neon) { int size_packed_B = opt_NEON::fastGemmPackBSize(N, K); - packed_B.resize(size_packed_B); - opt_NEON::fastGemmPackBKernel(B.ptr(), (char *)packed_B.data(), N, K, ldb0, ldb1, B.elemSize()); + packed_B.resize(size_packed_B * batch); + auto *packed_b = (char*)packed_B.data(); + for (int i = 0; i < batch; i++) { + opt_NEON::fastGemmPackBKernel(b, packed_b, N, K, ldb0, ldb1, esz); + b += N * K * esz; + packed_b += size_packed_B * esz; + } } else #endif #if CV_TRY_AVX2 if (opt.use_avx2) { int size_packed_B = opt_AVX2::fastGemmPackBSize(N, K); - packed_B.resize(size_packed_B); - opt_AVX2::fastGemmPackBKernel(B.ptr(), (char *)packed_B.data(), N, K, ldb0, ldb1, B.elemSize()); + packed_B.resize(size_packed_B * batch); + auto *packed_b = (char*)packed_B.data(); + for (int i = 0; i < batch; i++) { + opt_AVX2::fastGemmPackBKernel(b, packed_b, N, K, ldb0, ldb1, esz); + b += N * K * esz; + packed_b += size_packed_B * esz; + } } else #endif #if CV_TRY_AVX if (opt.use_avx) { int size_packed_B = opt_AVX::fastGemmPackBSize(N, K); - packed_B.resize(size_packed_B); - opt_AVX::fastGemmPackBKernel(B.ptr(), (char *)packed_B.data(), N, K, ldb0, ldb1, B.elemSize()); + packed_B.resize(size_packed_B * batch); + auto *packed_b = (char*)packed_B.data(); + for (int i = 0; i < batch; i++) { + opt_AVX::fastGemmPackBKernel(b, packed_b, N, K, ldb0, ldb1, esz); + b += N * K * esz; + packed_b += size_packed_B * esz; + } } else #endif #if CV_TRY_LASX if (opt.use_lasx) { int size_packed_B = opt_LASX::fastGemmPackBSize(N, K); - packed_B.resize(size_packed_B); - opt_LASX::fastGemmPackBKernel(B.ptr(), (char *)packed_B.data(), N, K, ldb0, ldb1, B.elemSize()); + packed_B.resize(size_packed_B * batch); + auto *packed_b = (char*)packed_B.data(); + for (int i = 0; i < batch; i++) { + opt_LASX::fastGemmPackBKernel(b, packed_b, N, K, ldb0, ldb1, esz); + b += N * K * esz; + packed_b += size_packed_B * esz; + } } else #endif { int size_packed_B = cpu_baseline::fastGemmPackBSize(N, K); - packed_B.resize(size_packed_B); - cpu_baseline::fastGemmPackBKernel(B.ptr(), (char *)packed_B.data(), N, K, ldb0, ldb1, B.elemSize()); + packed_B.resize(size_packed_B * batch); + auto *packed_b = (char*)packed_B.data(); + for (int i = 0; i < batch; i++) { + cpu_baseline::fastGemmPackBKernel(b, packed_b, N, K, ldb0, ldb1, esz); + b += N * K * esz; + packed_b += size_packed_B * esz; + } } } @@ -131,7 +159,6 @@ void fastGemm(bool trans_a, int M, int N, int K, void fastGemm(bool trans_a, bool trans_b, int ma, int na, int mb, int nb, float alpha, const float *A, int 
lda0, int lda1, const float *B, int ldb0, int ldb1, float beta, float *C, int ldc, FastGemmOpt &opt) { - const char *a = (const char *)A; const char *b = (const char *)B; char *c = (char *)C; @@ -209,54 +236,93 @@ void fastGemm(bool trans_a, bool trans_b, beta, c, ldc, opt); } -void fastGemmBatched(bool trans_a, bool trans_b, - float alpha, const Mat &A, const Mat &B, - float beta, Mat &C, FastGemmOpt &opt) { - CV_CheckTypeEQ(A.type(), B.type(), "DNN/fastGemmBatched: A and B should have the same type"); - CV_CheckTypeEQ(B.type(), C.type(), "DNN/fastGemmBatched: B and C should have the same type"); - CV_CheckTypeEQ(A.type(), CV_32F, "DNN/fastGemmBatched: only support float32 for now"); +void fastGemmBatch(size_t batch, const size_t *A_offsets, const size_t *B_offsets, const size_t *C_offsets, + int M, int N, int K, float alpha, const float *A, int lda0, int lda1, + const float *B, int ldb0, int ldb1, float beta, float *C, int ldc, FastGemmOpt &opt) { + const char *a = (const char *)A; + const char *b = (const char *)B; + char *c = (char *)C; - const auto shape_a = shape(A); - size_t dims_A = shape_a.size(); - CV_CheckGE(dims_A, static_cast(2), "DNN/fastGemmBatched: A must be n-dimensional (n >= 2)"); - const auto shape_b = shape(B); - CV_CheckEQ(shape_b.size(), static_cast(2), "DNN/fastGemmBatched: B must be 2-dimensional"); - const auto shape_c = shape(C); - size_t dims_C = shape_c.size(); - CV_CheckGE(dims_C, static_cast(2), "DNN/fastGemmBatched: C must be n-dimensional (n >= 2)"); - - if (trans_a) { - int ma = shape_a[dims_A - 2], na = shape_a[dims_A - 1]; - int mb = shape_b[0], nb = shape_b[1]; - - int lda0 = na, lda1 = 1, ldb0 = nb, ldb1 = 1, ldc = shape_c[1]; - - const float *a = A.ptr(); - const float *b = B.ptr(); - float *c = C.ptr(); - - int batches = std::accumulate(shape_a.begin(), shape_a.end() - 2, 1, std::multiplies()); - int step_a = ma * na, step_c = na * nb; - for (int i = 0; i < batches; i++) { - fastGemm(true, trans_b, ma, na, mb, nb, - alpha, a + i * step_a, lda0, lda1, b, ldb0, ldb1, - beta, c + i * step_c, ldc, opt); - } - } else { - int ma = std::accumulate(shape_a.begin(), shape_a.end() - 1, 1, std::multiplies()), - na = shape_a[dims_A - 1]; - int mb = shape_b[0], nb = shape_b[1]; - - int lda0 = na, lda1 = 1, ldb0 = nb, ldb1 = 1, ldc = shape_c[1]; - - const float *a = A.ptr(); - const float *b = B.ptr(); - float *c = C.ptr(); - - fastGemm(false, trans_b, ma, na, mb, nb, - alpha, a, lda0, lda1, b, ldb0, ldb1, - beta, c, ldc, opt); +#if CV_TRY_NEON + if (opt.use_neon) { + opt_NEON::fastGemmBatchKernel(batch, A_offsets, B_offsets, C_offsets, M, N, K, alpha, a, lda0, lda1, b, ldb0, ldb1, beta, c, ldc, sizeof(float)); + } else +#endif +#if CV_TRY_AVX2 + if (opt.use_avx2) { + opt_AVX2::fastGemmBatchKernel(batch, A_offsets, B_offsets, C_offsets, M, N, K, alpha, a, lda0, lda1, b, ldb0, ldb1, beta, c, ldc, sizeof(float)); + } else +#endif +#if CV_TRY_AVX + if (opt.use_avx) { + opt_AVX::fastGemmBatchKernel(batch, A_offsets, B_offsets, C_offsets, M, N, K, alpha, a, lda0, lda1, b, ldb0, ldb1, beta, c, ldc, sizeof(float)); + } else +#endif +#if CV_TRY_LASX + if (opt.use_lasx) { + opt_LASX::fastGemmBatchKernel(batch, A_offsets, B_offsets, C_offsets, M, N, K, alpha, a, lda0, lda1, b, ldb0, ldb1, beta, c, ldc, sizeof(float)); + } else +#endif + { + cpu_baseline::fastGemmBatchKernel(batch, A_offsets, B_offsets, C_offsets, M, N, K, alpha, a, lda0, lda1, b, ldb0, ldb1, beta, c, ldc, sizeof(float)); } } +void fastGemmBatch(size_t batch, const size_t *A_offsets, const size_t 
*packed_B_offsets, const size_t *C_offsets, + int M, int N, int K, float alpha, const float *A, int lda0, int lda1, + const float *packed_B, float beta, float *C, int ldc, FastGemmOpt &opt) { + const char *a = (const char *)A; + const char *b = (const char *)packed_B; + char *c = (char *)C; + +#if CV_TRY_NEON + if (opt.use_neon) { + opt_NEON::fastGemmBatchKernel(batch, A_offsets, packed_B_offsets, C_offsets, M, N, K, alpha, a, lda0, lda1, b, beta, c, ldc, sizeof(float)); + } else +#endif +#if CV_TRY_AVX2 + if (opt.use_avx2) { + opt_AVX2::fastGemmBatchKernel(batch, A_offsets, packed_B_offsets, C_offsets, M, N, K, alpha, a, lda0, lda1, b, beta, c, ldc, sizeof(float)); + } else +#endif +#if CV_TRY_AVX + if (opt.use_avx) { + opt_AVX::fastGemmBatchKernel(batch, A_offsets, packed_B_offsets, C_offsets, M, N, K, alpha, a, lda0, lda1, b, beta, c, ldc, sizeof(float)); + } else +#endif +#if CV_TRY_LASX + if (opt.use_lasx) { + opt_LASX::fastGemmBatchKernel(batch, A_offsets, packed_B_offsets, C_offsets, M, N, K, alpha, a, lda0, lda1, b, beta, c, ldc, sizeof(float)); + } else +#endif + { + cpu_baseline::fastGemmBatchKernel(batch, A_offsets, packed_B_offsets, C_offsets, M, N, K, alpha, a, lda0, lda1, b, beta, c, ldc, sizeof(float)); + } +} + +void fastGemmBatch(bool trans_a, bool trans_b, + float alpha, const Mat &A, const Mat &B, + float beta, Mat &C, FastGemmOpt &opt) { + CV_CheckTypeEQ(A.type(), B.type(), "DNN/fastGemmBatch: A and B should have the same type"); + CV_CheckTypeEQ(B.type(), C.type(), "DNN/fastGemmBatch: B and C should have the same type"); + CV_CheckTypeEQ(A.type(), CV_32F, "DNN/fastGemmBatch: only support float32 for now"); + + const auto shape_a = shape(A); + const auto shape_b = shape(B); + const auto shape_c = shape(C); + CV_CheckGE(shape_a.size(), static_cast(2), "DNN/fastGemmBatch: A must be n-dimensional (n >= 2)"); + CV_CheckEQ(shape_b.size(), static_cast(2), "DNN/fastGemmBatch: B must be n-dimensional (n >= 2)"); + + const float *a = A.ptr(); + const float *b = B.ptr(); + float *c = C.ptr(); + + MatMulHelper helper; + helper.compute(trans_a, trans_b, shape_a, shape_b, shape_c); + + fastGemmBatch(helper.batch, helper.A_offsets.data(), helper.B_offsets.data(), helper.C_offsets.data(), + helper.M, helper.N, helper.K, alpha, a, helper.lda0, helper.lda1, b, helper.ldb0, + helper.ldb1, beta, c, helper.ldc, opt); +} + }} // cv::dnn diff --git a/modules/dnn/src/layers/cpu_kernels/fast_gemm.hpp b/modules/dnn/src/layers/cpu_kernels/fast_gemm.hpp index 7f9e5c3017..9060068080 100644 --- a/modules/dnn/src/layers/cpu_kernels/fast_gemm.hpp +++ b/modules/dnn/src/layers/cpu_kernels/fast_gemm.hpp @@ -42,6 +42,112 @@ struct FastGemmOpt { } }; +struct MatMulHelper { + std::vector A_offsets; + std::vector B_offsets; + std::vector packed_B_offsets; + std::vector C_offsets; + std::vector A_rows; + std::vector B_rows; + std::vector C_rows; + size_t batch; + + int lda0, lda1; + int ldb0, ldb1; + int ldc; + + int M, N, K; + + MatMulHelper() { + A_offsets = {0}; + B_offsets = {0}; + packed_B_offsets = {0}; + C_offsets = {0}; + A_rows = {0}; + B_rows = {0}; + C_rows = {0}; + + batch = 0; + } + + bool empty() const { + return batch == 0; + } + + void compute(bool trans_a, bool trans_b, MatShape A_shape, MatShape B_shape, MatShape C_shape) { + auto A_ndims = A_shape.size(), B_ndims = B_shape.size(), C_ndims = C_shape.size(); + int ma = A_shape[A_ndims - 2], na = A_shape.back(); + int mb = B_shape[B_ndims - 2], nb = B_shape.back(); + lda0 = na, lda1 = 1; + ldb0 = nb, ldb1 = 1; + ldc = C_shape.back(); + + M = 
trans_a ? na : ma; + N = trans_b ? mb : nb; + K = trans_a ? ma : na; + + if (trans_a) { + std::swap(lda0, lda1); + } + if (trans_b) { + std::swap(ldb0, ldb1); + } + + // compute offsets + auto batch_ndims = C_ndims - 2; + + batch = total(C_shape, 0, batch_ndims); + + A_offsets.resize(batch, 0); + B_offsets.resize(batch, 0); + C_offsets.resize(batch, 0); + A_rows.resize(batch, 0); + B_rows.resize(batch, 0); + C_rows.resize(batch, 0); + + // build C_offsets + size_t C_step = total(C_shape, C_ndims - 2); + + MatShape A_broadcast_shape(C_ndims, 1); + std::memcpy(A_broadcast_shape.data() + (C_ndims - A_ndims), A_shape.data(), A_ndims * sizeof(int)); + MatShape B_broadcast_shape(C_shape.size(), 1); + std::memcpy(B_broadcast_shape.data() + (C_ndims - B_ndims), B_shape.data(), B_shape.size() * sizeof(int)); + std::vector A_steps(C_ndims, 1), B_steps(C_ndims, 1); + for (int i = C_ndims - 2; i >= 0; i--) { + A_steps[i] = A_steps[i + 1] * A_broadcast_shape[i + 1]; + B_steps[i] = B_steps[i + 1] * B_broadcast_shape[i + 1]; + } + size_t t, idx; + for (size_t i = 0; i < batch; i++) { + C_offsets[i] = i * C_step; + C_rows[i] = i; + + size_t A_offset = 0, B_offset = 0; + t = i; + for (int j = batch_ndims - 1; j >= 0; j--) { + idx = t / C_shape[j]; + int idx_offset = (int)(t - idx * C_shape[j]); + A_offset += A_broadcast_shape[j] == 1 ? 0 : idx_offset * A_steps[j]; + B_offset += B_broadcast_shape[j] == 1 ? 0 : idx_offset * B_steps[j]; + t = idx; + } + A_offsets[i] = A_offset; + B_offsets[i] = B_offset; + A_rows[i] = A_offset / (M * K); + B_rows[i] = B_offset / (N * K); + } + } + + // only run after compute + void updatePackedBOffsets(size_t packed_B_size) { + size_t packed_B_inner_size = packed_B_size / batch; + packed_B_offsets.resize(B_offsets.size()); + for (size_t i = 0; i < packed_B_offsets.size(); i++) { + packed_B_offsets[i] = (B_offsets[i] / (N * K)) * packed_B_inner_size; + } + } +}; + void fastGemmPackB(const Mat &m, std::vector &packed_B, bool trans, FastGemmOpt &opt); void fastGemm(bool trans_a, int M, int N, int K, @@ -55,10 +161,14 @@ void fastGemm(bool trans_a, bool trans_b, float alpha, const Mat &A, const Mat &B, float beta, Mat &C, FastGemmOpt &opt); -// FIXME: B needs to 2d for now. Support nd (n>=2) B in the future. 
-void fastGemmBatched(bool trans_a, bool trans_b, - float alpha, const Mat &A, const Mat &B, - float beta, Mat &C, FastGemmOpt &opt); +void fastGemmBatch(size_t batch, const size_t *A_offsets, const size_t *B_offsets, const size_t *C_offsets, + int M, int N, int K, float alpha, const float *A, int lda0, int lda1, + const float *B, int ldb0, int ldb1, float beta, float *C, int ldc, FastGemmOpt &opt); +void fastGemmBatch(size_t batch, const size_t *A_offsets, const size_t *B_offsets, const size_t *C_offsets, + int M, int N, int K, float alpha, const float *A, int lda0, int lda1, + const float *packed_B, float beta, float *C, int ldc, FastGemmOpt &opt); +void fastGemmBatch(bool trans_a, bool trans_b, float alpha, const Mat &A, + const Mat &B, float beta, Mat &C, FastGemmOpt &opt); }} // cv::dnn diff --git a/modules/dnn/src/layers/cpu_kernels/fast_gemm_kernels.default.hpp b/modules/dnn/src/layers/cpu_kernels/fast_gemm_kernels.default.hpp index b9362bb4d5..e985fc46ee 100644 --- a/modules/dnn/src/layers/cpu_kernels/fast_gemm_kernels.default.hpp +++ b/modules/dnn/src/layers/cpu_kernels/fast_gemm_kernels.default.hpp @@ -88,6 +88,13 @@ void fastGemmKernel(int M, int N, int K, float alpha, const char *A, int lda0, int lda1, const char *packed_B, float beta, char *C, int ldc, int esz); +void fastGemmBatchKernel(size_t batch, const size_t *A_offsets, const size_t *B_offsets, const size_t *C_offsets, + int M, int N, int K, float alpha, const char *A, int lda0, int lda1, + const char *B, int ldb0, int ldb1, float beta, char *C, int ldc, int esz); +void fastGemmBatchKernel(size_t batch, const size_t *A_offsets, const size_t *B_offsets, const size_t *C_offsets, + int M, int N, int K, float alpha, const char *A, int lda0, int lda1, + const char *packed_B, float beta, char *C, int ldc, int esz); + FAST_GEMM_IMPLEMENT_PACK(8, _f32, float, float) FAST_GEMM_IMPLEMENT_PACK(12, _f32, float, float) @@ -300,6 +307,153 @@ void fastGemmKernel(int M, int N, int K, parallel_for_(Range(0, total), fn, nstripes); } +void fastGemmBatchKernel(size_t batch, const size_t *A_offsets, const size_t *B_offsets, const size_t *C_offsets, + int M, int N, int K, float alpha, const char *A, int lda0, int lda1, + const char *B, int ldb0, int ldb1, float beta, char *C, int ldc, int esz) { + int GEMM_MC = FAST_GEMM_F32_MC, + GEMM_NC = FAST_GEMM_F32_NC, + GEMM_MR = FAST_GEMM_F32_MR, + GEMM_NR = FAST_GEMM_F32_NR; + + int MC = (((GEMM_MC < M ? GEMM_MC : M) + GEMM_MR - 1) / GEMM_MR) * GEMM_MR; + int NC = (((GEMM_NC < N ? GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR; + int KC = std::min(FAST_GEMM_F32_PACKED_STRIDE_K, K); + + size_t buff_size = KC * (MC + NC) * esz; + bool use_stackbuff = buff_size <= FAST_GEMM_MAX_STACKBUF; + int m_tiles = (M + MC - 1) / MC; + int n_tiles = (N + NC - 1) / NC; + int total_tiles = m_tiles * n_tiles; + + auto fn = [&](const Range &r) { + char* packed_a = (char*)(use_stackbuff ? alloca(buff_size) : malloc(buff_size)); + char* packed_b = packed_a + KC * MC * esz; + int start = r.start; + int end = r.end; + + for (int tile_idx = start; tile_idx < end; tile_idx++) { + const int batch_index = static_cast(tile_idx / total_tiles); + const int m_tiles_index = static_cast((tile_idx - batch_index * total_tiles) / n_tiles); + const int n_tiles_index = static_cast(tile_idx % n_tiles); + + int i0 = m_tiles_index * MC; + int j0 = n_tiles_index * NC; + int mc = M - i0 < MC ? M - i0 : MC; + int nc = N - j0 < NC ? 
N - j0 : NC; + int ldc_block = ldc; + const char *a_block = A + A_offsets[batch_index] * esz; + const char *b_block = B + B_offsets[batch_index] * esz; + char* c_block = C + C_offsets[batch_index] * esz + (i0 * ldc + j0) * esz; + + if (beta == 0.f) { + for(int i = 0; i < mc; i++) + memset(c_block + i * ldc_block * esz, 0, nc * esz); + } else if (beta != 1.f) { + for(int i = 0; i < mc; i++) { + float* c_i = (float*)c_block + i * ldc_block; + for(int j = 0; j < nc; j++) + c_i[j] *= beta; + } + } + + for(int k0 = 0; k0 < K; k0 += KC) + { + int kc = K - k0 < KC ? K - k0 : KC; + // pack a + fast_gemm_pack8_f32(mc, kc, a_block + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a); + + // pack b + fast_gemm_pack12_f32(nc, kc, b_block + (k0 * ldb0 + j0 * ldb1) * esz, ldb1, ldb0, packed_b); + + // run kernel + fast_gemm_macro_kernel(mc, nc, kc, packed_a, packed_b, alpha, c_block, ldc_block, esz); + } + } + + if (!use_stackbuff) { + free(packed_a); + } + }; + + int total = batch * total_tiles; + int cost_per_thread = static_cast((K / KC) * (MC / GEMM_MR) * (NC / GEMM_NR)); + double nstripes = (size_t)total * cost_per_thread * (1 / 1024.0); + parallel_for_(Range(0, total), fn, nstripes); +} + +void fastGemmBatchKernel(size_t batch, const size_t *A_offsets, const size_t *B_offsets, const size_t *C_offsets, + int M, int N, int K, float alpha, const char *A, int lda0, int lda1, + const char *packed_B, float beta, char *C, int ldc, int esz) { + int GEMM_MC = FAST_GEMM_F32_MC, + GEMM_NC = FAST_GEMM_F32_NC, + GEMM_MR = FAST_GEMM_F32_MR, + GEMM_NR = FAST_GEMM_F32_NR; + + int MC = (((GEMM_MC < M ? GEMM_MC : M) + GEMM_MR - 1) / GEMM_MR) * GEMM_MR; + int NC = (((GEMM_NC < N ? GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR; + int KC = std::min(FAST_GEMM_F32_PACKED_STRIDE_K, K); + + size_t buff_size = KC * MC * esz; + bool use_stackbuff = buff_size <= FAST_GEMM_MAX_STACKBUF; + int m_tiles = (M + MC - 1) / MC; + int n_tiles = (N + NC - 1) / NC; + int total_tiles = m_tiles * n_tiles; + + auto fn = [&](const Range &r) { + char* packed_a = (char*)(use_stackbuff ? alloca(buff_size) : malloc(buff_size)); + const char *packed_b = packed_B; + int start = r.start; + int end = r.end; + + for (int tile_idx = start; tile_idx < end; tile_idx++) { + const int batch_index = static_cast(tile_idx / total_tiles); + const int m_tiles_index = static_cast((tile_idx - batch_index * total_tiles) / n_tiles); + const int n_tiles_index = static_cast(tile_idx % n_tiles); + + int i0 = m_tiles_index * MC; + int j0 = n_tiles_index * NC; + int mc = M - i0 < MC ? M - i0 : MC; + int nc = N - j0 < NC ? N - j0 : NC; + int ldc_block = ldc; + const char *a_block = A + A_offsets[batch_index] * esz; + packed_b = packed_B + B_offsets[batch_index] * esz + j0 * K * esz; + char* c_block = C + C_offsets[batch_index] * esz + (i0 * ldc + j0) * esz; + + if (beta == 0.f) { + for(int i = 0; i < mc; i++) + memset(c_block + i * ldc_block * esz, 0, nc * esz); + } else if (beta != 1.f) { + for(int i = 0; i < mc; i++) { + float* c_i = (float*)c_block + i * ldc_block; + for(int j = 0; j < nc; j++) + c_i[j] *= beta; + } + } + + int _nc = static_cast((nc + GEMM_NR - 1) / GEMM_NR) * GEMM_NR * esz; + for(int k0 = 0; k0 < K; k0 += KC) + { + int kc = K - k0 < KC ? 
K - k0 : KC; + // pack a + fast_gemm_pack8_f32(mc, kc, a_block + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a); + + // run kernel + fast_gemm_macro_kernel(mc, nc, kc, packed_a, packed_b, alpha, c_block, ldc_block, esz); + packed_b += _nc * kc; + } + } + + if (!use_stackbuff) { + free(packed_a); + } + }; + + int total = batch * total_tiles; + int cost_per_thread = static_cast((K / KC) * (MC / GEMM_MR) * (NC / GEMM_NR)); + double nstripes = (size_t)total * cost_per_thread * (1 / 1024.0); + parallel_for_(Range(0, total), fn, nstripes); +} + }}} // cv::dnn::cpu_baseline #undef FAST_GEMM_STORAGE diff --git a/modules/dnn/src/layers/cpu_kernels/fast_gemm_kernels.simd.hpp b/modules/dnn/src/layers/cpu_kernels/fast_gemm_kernels.simd.hpp index 7d123ed9b5..74677f73ed 100644 --- a/modules/dnn/src/layers/cpu_kernels/fast_gemm_kernels.simd.hpp +++ b/modules/dnn/src/layers/cpu_kernels/fast_gemm_kernels.simd.hpp @@ -22,8 +22,8 @@ #define FAST_GEMM_F32_MC 48 #define FAST_GEMM_F32_NC 128 #else // CV_NEON_AARCH64, SIMD128 -#define FAST_GEMM_F32_MC 64 -#define FAST_GEMM_F32_NC 240 +#define FAST_GEMM_F32_MC 144 +#define FAST_GEMM_F32_NC 72 #endif #if CV_AVX @@ -127,6 +127,13 @@ void fastGemmKernel(int M, int N, int K, float alpha, const char *A, int lda0, int lda1, const char *packed_B, float beta, char *C, int ldc, int esz); +void fastGemmBatchKernel(size_t batch, const size_t *A_offsets, const size_t *B_offsets, const size_t *C_offsets, + int M, int N, int K, float alpha, const char *A, int lda0, int lda1, + const char *B, int ldb0, int ldb1, float beta, char *C, int ldc, int esz); +void fastGemmBatchKernel(size_t batch, const size_t *A_offsets, const size_t *B_offsets, const size_t *C_offsets, + int M, int N, int K, float alpha, const char *A, int lda0, int lda1, + const char *packed_B, float beta, char *C, int ldc, int esz); + #ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY /* @@ -721,6 +728,177 @@ void fastGemmKernel(int M, int N, int K, parallel_for_(Range(0, total), fn, nstripes); } +void fastGemmBatchKernel(size_t batch, const size_t *A_offsets, const size_t *B_offsets, const size_t *C_offsets, + int M, int N, int K, float alpha, const char *A, int lda0, int lda1, + const char *B, int ldb0, int ldb1, float beta, char *C, int ldc, int esz) { + int GEMM_MC = FAST_GEMM_F32_MC, + GEMM_NC = FAST_GEMM_F32_NC, + GEMM_MR = FAST_GEMM_F32_MR, + GEMM_NR = FAST_GEMM_F32_NR; + + int MC = (((GEMM_MC < M ? GEMM_MC : M) + GEMM_MR - 1) / GEMM_MR) * GEMM_MR; + int NC = (((GEMM_NC < N ? GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR; + int KC = std::min(FAST_GEMM_F32_PACKED_STRIDE_K, K); + + size_t buff_size = KC * (MC + NC) * esz; + bool use_stackbuff = buff_size <= FAST_GEMM_MAX_STACKBUF; + int m_tiles = (M + MC - 1) / MC; + int n_tiles = (N + NC - 1) / NC; + int total_tiles = m_tiles * n_tiles; + + auto fn = [&](const Range &r) { + char* packed_a = (char*)(use_stackbuff ? alloca(buff_size) : malloc(buff_size)); + char* packed_b = packed_a + KC * MC * esz; + int start = r.start; + int end = r.end; + + for (int tile_idx = start; tile_idx < end; tile_idx++) { + const int batch_index = static_cast(tile_idx / total_tiles); + const int m_tiles_index = static_cast((tile_idx - batch_index * total_tiles) / n_tiles); + const int n_tiles_index = static_cast(tile_idx % n_tiles); + + int i0 = m_tiles_index * MC; + int j0 = n_tiles_index * NC; + int mc = M - i0 < MC ? M - i0 : MC; + int nc = N - j0 < NC ? 
N - j0 : NC; + int ldc_block = ldc; + const char *a_block = A + A_offsets[batch_index] * esz; + const char *b_block = B + B_offsets[batch_index] * esz; + char* c_block = C + C_offsets[batch_index] * esz + (i0 * ldc + j0) * esz; + + if (beta == 0.f) { + for(int i = 0; i < mc; i++) + memset(c_block + i * ldc_block * esz, 0, nc * esz); + } else if (beta != 1.f) { + for(int i = 0; i < mc; i++) { + float* c_i = (float*)c_block + i * ldc_block; + for(int j = 0; j < nc; j++) + c_i[j] *= beta; + } + } + + for(int k0 = 0; k0 < K; k0 += KC) + { + int kc = K - k0 < KC ? K - k0 : KC; + // pack a +#if CV_NEON && CV_NEON_AARCH64 + fast_gemm_pack8_f32(mc, kc, a_block + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a); +#elif CV_AVX + fast_gemm_pack12_f32(mc, kc, a_block + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a); +#elif CV_LASX + fast_gemm_pack12_f32(mc, kc, a_block + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a); +#elif CV_SIMD128 + fast_gemm_pack8_f32(mc, kc, a_block + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a); +#endif + + // pack b +#if CV_NEON && CV_NEON_AARCH64 + fast_gemm_pack12_f32(nc, kc, b_block + (k0 * ldb0 + j0 * ldb1) * esz, ldb1, ldb0, packed_b); +#elif CV_AVX + fast_gemm_pack8_f32(nc, kc, b_block + (k0 * ldb0 + j0 * ldb1) * esz, ldb1, ldb0, packed_b); +#elif CV_LASX + fast_gemm_pack16_f32(nc, kc, b_block + (k0 * ldb0 + j0 * ldb1) * esz, ldb1, ldb0, packed_b); +#elif CV_SIMD128 + fast_gemm_pack12_f32(nc, kc, b_block + (k0 * ldb0 + j0 * ldb1) * esz, ldb1, ldb0, packed_b); +#endif + + // run kernel + fast_gemm_macro_kernel(mc, nc, kc, packed_a, packed_b, alpha, c_block, ldc_block, esz); + } + } + + if (!use_stackbuff) { + free(packed_a); + } + }; + + int total = batch * total_tiles; + int cost_per_thread = static_cast((K / KC) * (MC / GEMM_MR) * (NC / GEMM_NR)); + double nstripes = (size_t)total * cost_per_thread * (1 / 1024.0); + parallel_for_(Range(0, total), fn, nstripes); +} + +void fastGemmBatchKernel(size_t batch, const size_t *A_offsets, const size_t *B_offsets, const size_t *C_offsets, + int M, int N, int K, float alpha, const char *A, int lda0, int lda1, + const char *packed_B, float beta, char *C, int ldc, int esz) { + int GEMM_MC = FAST_GEMM_F32_MC, + GEMM_NC = FAST_GEMM_F32_NC, + GEMM_MR = FAST_GEMM_F32_MR, + GEMM_NR = FAST_GEMM_F32_NR; + + int MC = (((GEMM_MC < M ? GEMM_MC : M) + GEMM_MR - 1) / GEMM_MR) * GEMM_MR; + int NC = (((GEMM_NC < N ? GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR; + int KC = std::min(FAST_GEMM_F32_PACKED_STRIDE_K, K); + + size_t buff_size = KC * MC * esz; + bool use_stackbuff = buff_size <= FAST_GEMM_MAX_STACKBUF; + int m_tiles = (M + MC - 1) / MC; + int n_tiles = (N + NC - 1) / NC; + int total_tiles = m_tiles * n_tiles; + + auto fn = [&](const Range &r) { + char* packed_a = (char*)(use_stackbuff ? alloca(buff_size) : malloc(buff_size)); + const char *packed_b = packed_B; + int start = r.start; + int end = r.end; + + for (int tile_idx = start; tile_idx < end; tile_idx++) { + const int batch_index = static_cast(tile_idx / total_tiles); + const int m_tiles_index = static_cast((tile_idx - batch_index * total_tiles) / n_tiles); + const int n_tiles_index = static_cast(tile_idx % n_tiles); + + int i0 = m_tiles_index * MC; + int j0 = n_tiles_index * NC; + int mc = M - i0 < MC ? M - i0 : MC; + int nc = N - j0 < NC ? 
N - j0 : NC; + int ldc_block = ldc; + const char *a_block = A + A_offsets[batch_index] * esz; + packed_b = packed_B + B_offsets[batch_index] * esz + j0 * K * esz; + char* c_block = C + C_offsets[batch_index] * esz + (i0 * ldc + j0) * esz; + + if (beta == 0.f) { + for(int i = 0; i < mc; i++) + memset(c_block + i * ldc_block * esz, 0, nc * esz); + } else if (beta != 1.f) { + for(int i = 0; i < mc; i++) { + float* c_i = (float*)c_block + i * ldc_block; + for(int j = 0; j < nc; j++) + c_i[j] *= beta; + } + } + + int _nc = static_cast((nc + GEMM_NR - 1) / GEMM_NR) * GEMM_NR * esz; + for(int k0 = 0; k0 < K; k0 += KC) + { + int kc = K - k0 < KC ? K - k0 : KC; + // pack a +#if CV_NEON && CV_NEON_AARCH64 + fast_gemm_pack8_f32(mc, kc, a_block + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a); +#elif CV_AVX + fast_gemm_pack12_f32(mc, kc, a_block + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a); +#elif CV_LASX + fast_gemm_pack12_f32(mc, kc, a_block + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a); +#elif CV_SIMD128 + fast_gemm_pack8_f32(mc, kc, a_block + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a); +#endif + + // run kernel + fast_gemm_macro_kernel(mc, nc, kc, packed_a, packed_b, alpha, c_block, ldc_block, esz); + packed_b += _nc * kc; + } + } + + if (!use_stackbuff) { + free(packed_a); + } + }; + + int total = batch * total_tiles; + int cost_per_thread = static_cast((K / KC) * (MC / GEMM_MR) * (NC / GEMM_NR)); + double nstripes = (size_t)total * cost_per_thread * (1 / 1024.0); + parallel_for_(Range(0, total), fn, nstripes); +} + #endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY CV_CPU_OPTIMIZATION_NAMESPACE_END diff --git a/modules/dnn/src/layers/gemm_layer.cpp b/modules/dnn/src/layers/gemm_layer.cpp index 8bcec78343..821700c83e 100644 --- a/modules/dnn/src/layers/gemm_layer.cpp +++ b/modules/dnn/src/layers/gemm_layer.cpp @@ -211,7 +211,7 @@ public: CV_CheckGT(packed_B.size(), static_cast(0), "DNN/Gemm: constant B is not pre-packed"); fastGemm(trans_a, M, N, K, alpha, A.ptr(), na, packed_B.data(), 1.f, Y.ptr(), N, opt); } else { - fastGemmBatched(trans_a, trans_b, alpha, A, inputs[1], 1.f, Y, opt); + fastGemmBatch(trans_a, trans_b, alpha, A, inputs[1], 1.f, Y, opt); } } diff --git a/modules/dnn/src/layers/matmul_layer.cpp b/modules/dnn/src/layers/matmul_layer.cpp new file mode 100644 index 0000000000..c6cea65d87 --- /dev/null +++ b/modules/dnn/src/layers/matmul_layer.cpp @@ -0,0 +1,326 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
+ +#include "../precomp.hpp" + +#include +#include "cpu_kernels/fast_gemm.hpp" + +// OpenVINO backend +#include "../op_inf_engine.hpp" +#include "../ie_ngraph.hpp" + +// Vulkan backend +#include "../op_vkcom.hpp" + +// CUDA backend +#ifdef HAVE_CUDA +#include "../cuda4dnn/primitives/matmul_broadcast.hpp" +using namespace cv::dnn::cuda4dnn; +#endif + +// CANN backend +#include "../op_cann.hpp" + +namespace cv { namespace dnn { + +class MatMulLayerImpl CV_FINAL : public MatMulLayer { + public: + MatMulLayerImpl(const LayerParams& params) { + setParamsFrom(params); + + trans_a = params.get("transA", false); + trans_b = params.get("transB", false); + alpha = params.get("alpha", 1.f); + beta = params.get("beta", 1.f); + } + + virtual bool supportBackend(int backendId) CV_OVERRIDE { + return backendId == DNN_BACKEND_OPENCV || + backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH || + (backendId == DNN_BACKEND_VKCOM && haveVulkan() && !trans_a && !trans_b) || + backendId == DNN_BACKEND_CUDA || + backendId == DNN_BACKEND_CANN; + } + + virtual bool getMemoryShapes(const std::vector &inputs, + const int requiredOutputs, + std::vector &outputs, + std::vector &internals) const CV_OVERRIDE { + CV_CheckGE(inputs.size(), static_cast(1), "DNN/MatMul: one varible input at least"); + CV_CheckLE(inputs.size(), static_cast(2), "DNN/MatMul: two variable inputs at most"); + + const auto shape_A = inputs[0], shape_B = blobs.empty() ? inputs[1] : shape(blobs[0]); + CV_CheckGE(shape_A.size(), static_cast(2), "DNN/MatMul: invalid shape of input A"); + CV_CheckGE(shape_B.size(), static_cast(2), "DNN/MatMul: invalid shape of input B"); + + // Check legal matrix multiplication + int mA = shape_A[shape_A.size() - 2], nA = shape_A.back(); + int mB = shape_B[shape_B.size() - 2], nB = shape_B.back(); + int M = trans_a ? nA : mA; + int N = trans_b ? mB : nB; + int K_A = trans_a ? mA : nA; + int K_B = trans_b ? nB : mB; + CV_CheckEQ(K_A, K_B, "DNN/MatMul: invalid dimension K"); + + // Check legal broadcast. It is legal for sure if A and B are 2d, or one of them is 2d. + MatShape common_shape; + if (shape_A.size() != 2 || shape_B.size() != 2) { + const auto &shape_more_dims = shape_A.size() > shape_B.size() ? shape_A : shape_B; + const auto &shape_less_dims = shape_A.size() > shape_B.size() ? shape_B : shape_A; + size_t diff_dims = shape_more_dims.size() - shape_less_dims.size(); + common_shape = shape_more_dims; + for (size_t i = 0; i < shape_less_dims.size() - 2; i++) { + const auto dl = shape_less_dims[i], dm = shape_more_dims[i + diff_dims]; + if (dl != 1 && dm != 1 && dl != dm) { + CV_Error(Error::StsBadSize, format("DNN/MatMul: invalid shape for broadcasting, shape_A[%zu]=%d, shape_B[%zu]=%d\n", i, shape_less_dims[i], i, shape_more_dims[i + diff_dims])); + } + + if (dm == 1) { + common_shape[i + diff_dims] = dl; + } + } + common_shape[common_shape.size() - 2] = M; + common_shape[common_shape.size() - 1] = N; + } else { + common_shape.resize(2); + common_shape[0] = M; + common_shape[1] = N; + } + + outputs.assign(1, common_shape); + return false; + } + + virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE { + opt.init(); + + std::vector inputs, outputs; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); + + const auto A_shape = shape(inputs[0]), + B_shape = blobs.empty() ? 
shape(inputs[1]) : shape(blobs[0]), + C_shape = shape(outputs[0]); + helper.compute(trans_a, trans_b, A_shape, B_shape, C_shape); + + if (!blobs.empty()) { + fastGemmPackB(blobs[0], packed_input_B, trans_b, opt); + helper.updatePackedBOffsets(packed_input_B.size()); + } + } + + // works like Y = numpy.matmul(A, B) + void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE { + CV_TRACE_FUNCTION(); + CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + + CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget), + forward_ocl(inputs_arr, outputs_arr, internals_arr)) + + if (inputs_arr.depth() == CV_16S) + { + forward_fallback(inputs_arr, outputs_arr, internals_arr); + return; + } + + std::vector inputs, outputs; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); + + const auto &A = inputs[0]; + auto &Y = outputs[0]; + + const auto *a = A.ptr(); + auto *y = Y.ptr(); + std::memset(y, 0, Y.total() * sizeof(float)); + + if (blobs.empty()) { + const auto &B = inputs[1]; + const auto *b = B.ptr(); + fastGemmBatch(helper.batch, helper.A_offsets.data(), helper.B_offsets.data(), helper.C_offsets.data(), + helper.M, helper.N, helper.K, alpha, a, helper.lda0, helper.lda1, + b, helper.ldb0, helper.ldb1, beta, y, helper.ldc, opt); + } else { + fastGemmBatch(helper.batch, helper.A_offsets.data(), helper.packed_B_offsets.data(), helper.C_offsets.data(), + helper.M, helper.N, helper.K, alpha, a, helper.lda0, helper.lda1, + packed_input_B.data(), beta, y, helper.ldc, opt); + } + } + +#ifdef HAVE_OPENCL + bool forward_ocl(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, InputArrayOfArrays internals) { + std::vector inputs; + std::vector outputs; + + bool use_half = (inputs_arr.depth() == CV_16S); + inputs_arr.getUMatVector(inputs); + outputs_arr.getUMatVector(outputs); + + const auto &input_A = inputs[0]; + UMat input_B; + if (blobs.empty()) { + input_B = inputs[1]; + } else { + blobs[0].copyTo(input_B); + } + auto &output = outputs[0]; + + int M = static_cast(helper.M), + N = static_cast(helper.N), + K = static_cast(helper.K), + batch = static_cast(helper.batch); + int batch_A = total(shape(input_A)) / (M * K), + batch_B = total(shape(input_B)) / (N * K); + MatShape new_shape_A{batch_A, M * K}, new_shape_B{batch_B, N * K}, new_shape_output{batch, M * N}; + + const auto input_A_2d = input_A.reshape(1, new_shape_A.size(), &new_shape_A[0]), + input_B_2d = input_B.reshape(1, new_shape_B.size(), &new_shape_B[0]); + auto output_2d = output.reshape(1, new_shape_output.size(), &new_shape_output[0]); + UMat A, B, C, A_fp32, B_fp32, C_fp32; + for (int i = 0; i < batch; i++) { + A = input_A_2d.row(helper.A_rows[i]).reshape(1, trans_a ? K : M); + B = input_B_2d.row(helper.B_rows[i]).reshape(1, trans_b ? 
K : N); + C = output_2d.row(helper.C_rows[i]).reshape(1, M); + + if (trans_a) { + A = A.t(); + } + if (trans_b) { + B = B.t(); + } + + if (use_half) { + convertFp16(A, A_fp32); + convertFp16(B, B_fp32); + convertFp16(C, C_fp32); + } else { + A_fp32 = A; + B_fp32 = B; + C_fp32 = C; + } + + cv::gemm(A_fp32, B_fp32, 1.f, noArray(), 0.f, C_fp32); + if (use_half) { + convertFp16(A_fp32, A); + convertFp16(B_fp32, B); + convertFp16(C_fp32, C); + } + } + return true; + } +#endif // HAVE_OPENCL + +#ifdef HAVE_DNN_NGRAPH + virtual Ptr initNgraph(const std::vector >& inputs, + const std::vector >& nodes) CV_OVERRIDE { + auto& input_A_node = nodes[0].dynamicCast()->node; + std::shared_ptr matmul; + + if (nodes.size() == 2) { + auto &input_B_node = nodes[1].dynamicCast()->node; + matmul = std::make_shared(input_A_node, input_B_node, trans_a, trans_b); + } else { + auto input_B_shape = getShape(blobs[0]); + auto input_B_node = std::make_shared(ngraph::element::f32, input_B_shape, blobs[0].data); + matmul = std::make_shared(input_A_node, input_B_node, trans_a, trans_b); + } + + return Ptr(new InfEngineNgraphNode(matmul)); + } +#endif // HAVE_DNN_NGRAPH + +#ifdef HAVE_VULKAN + virtual Ptr initVkCom(const std::vector > &inputs, + std::vector > &outputs) CV_OVERRIDE { + auto input_A_wrapper = inputs[0].dynamicCast(); + auto output_wrapper = outputs[0].dynamicCast(); + + const auto input_A_shape = shape(*input_A_wrapper->getMat()); + const auto output_shape = shape(*output_wrapper->getMat()); + if (output_shape.size() != 2) { + return Ptr(); + } + + std::vector constants; + + if (!blobs.empty()) { + constants.push_back(blobs[0]); + } + + Ptr op = new vkcom::OpMatMul(constants, input_A_shape[0], input_A_shape[1], output_shape[1]); + return Ptr(new VkComBackendNode(inputs, op, outputs)); + } +#endif + +#ifdef HAVE_CUDA + Ptr initCUDA(void *context_, + const std::vector>& inputs, + const std::vector>& outputs) override { + auto context = reinterpret_cast(context_); + auto input_B = blobs.empty() ? 
Mat() : blobs[0]; + + CV_CheckFalse(helper.empty(), "DNN/MatMul/CUDA: MatMulHelper is not initialized"); + + return make_cuda_node(preferableTarget, std::move(context->stream), std::move(context->cublas_handle), input_B, trans_a, trans_b, helper.A_offsets, helper.B_offsets, helper.C_offsets, helper.batch); + } +#endif // HAVE_CUDA + +#ifdef HAVE_CANN + virtual Ptr initCann(const std::vector > &inputs, + const std::vector > &outputs, + const std::vector >& nodes) CV_OVERRIDE { + auto input_A_wrapper = inputs[0].dynamicCast(); + auto input_A_desc = input_A_wrapper->getTensorDesc(); + auto input_A_node = nodes[0].dynamicCast()->getOp(); + + auto op = std::make_shared(name); + + // set attributes + op->set_attr_adj_x1(trans_a); + op->set_attr_adj_x2(trans_b); + + // set inputs + // set inputs : x1 + op->set_input_x1_by_name(*input_A_node, input_A_wrapper->name.c_str()); + op->update_input_desc_x1(*input_A_desc); + // set inputs : x2 + if (blobs.empty()) { // varaible input B + auto input_B_wrapper = inputs[1].dynamicCast(); + auto input_B_desc = input_B_wrapper->getTensorDesc(); + auto input_B_node = nodes[1].dynamicCast()->getOp(); + op->set_input_x2_by_name(*input_B_node, "y"); + op->update_input_desc_x2(*input_B_desc); + } else { // constant input B + auto B = blobs[0]; + auto const_B_node = std::make_shared(B.data, B.type(), shape(B), cv::format("%s_B", name.c_str())); + op->set_input_x2_by_name(*(const_B_node->getOp()), "y"); + op->update_input_desc_x2(*(const_B_node->getTensorDesc())); + } + + // set outputs + auto output_desc = std::make_shared(ge::Shape(), ge::FORMAT_NCHW, ge::DT_FLOAT); + op->update_output_desc_y(*output_desc); + return Ptr(new CannBackendNode(op)); + } +#endif // HAVE_CANN + + private: + bool trans_a; + bool trans_b; + float alpha; + float beta; + + std::vector packed_input_B; + + FastGemmOpt opt; + MatMulHelper helper; +}; + +Ptr MatMulLayer::create(const LayerParams& params) +{ + return makePtr(params); +} + +}} // cv::dnn diff --git a/modules/dnn/src/onnx/onnx_importer.cpp b/modules/dnn/src/onnx/onnx_importer.cpp index bd01aa095b..eee8b5828e 100644 --- a/modules/dnn/src/onnx/onnx_importer.cpp +++ b/modules/dnn/src/onnx/onnx_importer.cpp @@ -1957,50 +1957,33 @@ void ONNXImporter::parseGemm(LayerParams& layerParams, const opencv_onnx::NodePr addLayer(layerParams, node_proto); } -void ONNXImporter::parseMatMul(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto_) -{ - opencv_onnx::NodeProto node_proto = node_proto_; - CV_Assert(node_proto.input_size() == 2); - layerParams.type = "InnerProduct"; - layerParams.set("bias_term", false); - int firstInpDims, secondInpDims; +void ONNXImporter::parseMatMul(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto_) { + auto node_proto = node_proto_; + CV_CheckEQ(node_proto.input_size(), 2, "ONNXImporter/MatMul: two inputs required"); - if (constBlobs.find(node_proto.input(0)) != constBlobs.end()) - { - Mat blob = getBlob(node_proto, 0); - firstInpDims = blob.dims; - LayerParams constParams; - constParams.name = layerParams.name + "/const_0"; - constParams.type = "Const"; - constParams.blobs.push_back(blob); + for (int i = 0; i < node_proto.input_size(); i++) { + if (constBlobs.find(node_proto.input(i)) == constBlobs.end()) { + continue; + } - opencv_onnx::NodeProto tmpProto; - tmpProto.add_output(constParams.name); - addLayer(constParams, tmpProto); + Mat blob = getBlob(node_proto, i); - node_proto.set_input(0, constParams.name); + if (i == 1) { + layerParams.blobs.push_back(blob); + } else { + 
LayerParams const_params; + const_params.name = node_proto.input(i); + const_params.type = "Const"; + const_params.blobs.push_back(blob); + + opencv_onnx::NodeProto const_node_proto; + const_node_proto.add_output(const_params.name); + addLayer(const_params, const_node_proto); + + node_proto.set_input(i, const_params.name); + } } - else - firstInpDims = outShapes[node_proto.input(0)].size(); - if (constBlobs.find(node_proto.input(1)) != constBlobs.end()) - { - Mat blob = getBlob(node_proto, 1); - Mat transBlob; - secondInpDims = blob.dims; - // create order transposing last 2 dimensions - std::vector order(secondInpDims); - std::iota(order.begin(), order.end(), 0); - std::swap(order[secondInpDims - 2], order[secondInpDims - 1]); - transposeND(blob, order, transBlob); - layerParams.blobs.push_back(transBlob); - int numOutput = layerParams.blobs[0].total(0, secondInpDims - 1); - layerParams.set("num_output", numOutput); - layerParams.set("is_matmul", secondInpDims > 2); - } else - secondInpDims = outShapes[node_proto.input(1)].size(); - - layerParams.set("axis", firstInpDims - 1); addLayer(layerParams, node_proto); }
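
As a reading aid for the MatMulHelper changes in fast_gemm.hpp above: the broadcast bookkeeping amounts to decomposing each output batch index into per-dimension indices and skipping size-1 (broadcast) dimensions when accumulating the A/B offsets. Below is a minimal standalone sketch of that computation; the function and parameter names (computeBatchOffsets, a_mat, b_mat, c_mat) are illustrative only and not part of the patch.

#include <cstddef>
#include <vector>

// a_batch/b_batch are the batch (leading) dims of A and B, already padded with 1s to the
// rank of the output batch dims c_batch; a_mat/b_mat/c_mat are the element counts of a
// single A, B and C matrix (e.g. M*K, K*N, M*N). Offsets are in elements.
static void computeBatchOffsets(const std::vector<int>& a_batch,
                                const std::vector<int>& b_batch,
                                const std::vector<int>& c_batch,
                                std::size_t a_mat, std::size_t b_mat, std::size_t c_mat,
                                std::vector<std::size_t>& A_offsets,
                                std::vector<std::size_t>& B_offsets,
                                std::vector<std::size_t>& C_offsets)
{
    const int ndims = (int)c_batch.size();
    std::size_t batch = 1;
    for (int d : c_batch) batch *= (std::size_t)d;

    // element strides of each batch dim; the innermost batch dim strides over one whole matrix
    std::vector<std::size_t> a_steps(ndims), b_steps(ndims);
    std::size_t sa = a_mat, sb = b_mat;
    for (int i = ndims - 1; i >= 0; --i) {
        a_steps[i] = sa; sa *= (std::size_t)a_batch[i];
        b_steps[i] = sb; sb *= (std::size_t)b_batch[i];
    }

    A_offsets.assign(batch, 0);
    B_offsets.assign(batch, 0);
    C_offsets.assign(batch, 0);
    for (std::size_t i = 0; i < batch; ++i) {
        C_offsets[i] = i * c_mat;                // C is dense: one MxN matrix per output batch index
        std::size_t t = i;
        for (int j = ndims - 1; j >= 0; --j) {   // decompose i into per-dim indices, innermost first
            std::size_t idx = t % (std::size_t)c_batch[j];
            t /= (std::size_t)c_batch[j];
            if (a_batch[j] != 1) A_offsets[i] += idx * a_steps[j];  // broadcast (size-1) dims add nothing
            if (b_batch[j] != 1) B_offsets[i] += idx * b_steps[j];
        }
    }
}

// e.g. A: {1,12,M,K} x B: {2,12,K,N} -> C: {2,12,M,N}:
// computeBatchOffsets({1,12}, {2,12}, {2,12}, M*K, K*N, M*N, Ao, Bo, Co);
// Ao repeats every 12 entries (A is re-used across the broadcast dim), while Bo and Co keep advancing.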
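
And a minimal end-to-end sketch of driving the new layer, mirroring the MatMul perf test added above (same parameter names, constant-B mode, one vision-transformer shape from test_matmul_configs); the default OpenCV backend and target are assumed.

#include <cstdio>
#include <vector>
#include <opencv2/dnn.hpp>

int main()
{
    using namespace cv;
    using namespace cv::dnn;

    // B is attached as a constant blob, so finalize() pre-packs it via fastGemmPackB
    // and forward() dispatches to fastGemmBatch with the packed-B offsets.
    int a_dims[] = {12, 197, 64}, b_dims[] = {12, 64, 197};
    Mat A(3, a_dims, CV_32F), B(3, b_dims, CV_32F);
    randu(A, -1.0f, 1.0f);
    randu(B, -1.0f, 1.0f);

    LayerParams lp;
    lp.type = "MatMul";   // registered via CV_DNN_REGISTER_LAYER_CLASS(MatMul, MatMulLayer)
    lp.name = "testLayer";
    lp.set("transA", false);
    lp.set("transB", false);
    lp.set("alpha", 1.f);
    lp.set("beta", 1.f);
    lp.blobs.push_back(B);

    Net net;
    net.addLayerToPrev(lp.name, lp.type, lp);

    std::vector<String> input_names{"A"};
    net.setInputsNames(input_names);
    net.setInput(A, input_names[0]);

    Mat Y = net.forward();  // expected shape: 12 x 197 x 197
    std::printf("Y: %d x %d x %d\n", Y.size[0], Y.size[1], Y.size[2]);
    return 0;
}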