diff --git a/modules/dnn/src/cuda4dnn/csl/cublas.hpp b/modules/dnn/src/cuda4dnn/csl/cublas.hpp index 96cf70fab9..65e41a1399 100644 --- a/modules/dnn/src/cuda4dnn/csl/cublas.hpp +++ b/modules/dnn/src/cuda4dnn/csl/cublas.hpp @@ -425,8 +425,8 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cu const auto batch_count = static_cast<int>(batchCount); - AutoBuffer<half> buffer(3 * batch_count); - auto A_slices = (half**)(buffer.data()); + AutoBuffer<const half*> buffer(3 * batch_count); + auto A_slices = buffer.data(); auto B_slices = A_slices + batch_count; auto C_slices = B_slices + batch_count; // collect A, B and C slices @@ -438,18 +438,18 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cu const half **dev_A_slices = 0, **dev_B_slices = 0; half **dev_C_slices = 0; - cudaMalloc((void**)&dev_A_slices, batch_count * sizeof(half*)); - cudaMalloc((void**)&dev_B_slices, batch_count * sizeof(half*)); - cudaMalloc((void**)&dev_C_slices, batch_count * sizeof(half*)); - cudaMemcpy(dev_A_slices, A_slices, batch_count * sizeof(half*), cudaMemcpyHostToDevice); - cudaMemcpy(dev_B_slices, B_slices, batch_count * sizeof(half*), cudaMemcpyHostToDevice); - cudaMemcpy(dev_C_slices, C_slices, batch_count * sizeof(half*), cudaMemcpyHostToDevice); + CUDA4DNN_CHECK_CUDA(cudaMalloc((void**)&dev_A_slices, batch_count * sizeof(half*))); + CUDA4DNN_CHECK_CUDA(cudaMalloc((void**)&dev_B_slices, batch_count * sizeof(half*))); + CUDA4DNN_CHECK_CUDA(cudaMalloc((void**)&dev_C_slices, batch_count * sizeof(half*))); + CUDA4DNN_CHECK_CUDA(cudaMemcpy(dev_A_slices, A_slices, batch_count * sizeof(half*), cudaMemcpyHostToDevice)); + CUDA4DNN_CHECK_CUDA(cudaMemcpy(dev_B_slices, B_slices, batch_count * sizeof(half*), cudaMemcpyHostToDevice)); + CUDA4DNN_CHECK_CUDA(cudaMemcpy(dev_C_slices, C_slices, batch_count * sizeof(half*), cudaMemcpyHostToDevice)); CUDA4DNN_CHECK_CUBLAS(cublasHgemmBatched(handle.get(), opa, opb, iM, iN, iK, &alpha, dev_A_slices, ilda, 
dev_B_slices, ildb, &beta, dev_C_slices, ildc, batch_count)); - cudaFree(dev_A_slices); - cudaFree(dev_B_slices); - cudaFree(dev_C_slices); + CUDA4DNN_CHECK_CUDA(cudaFree(dev_A_slices)); + CUDA4DNN_CHECK_CUDA(cudaFree(dev_B_slices)); + CUDA4DNN_CHECK_CUDA(cudaFree(dev_C_slices)); } template <> inline @@ -475,8 +475,8 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cu const auto batch_count = static_cast<int>(batchCount); - AutoBuffer<float> buffer(3 * batch_count); - auto A_slices = (float**)(buffer.data()); + AutoBuffer<const float*> buffer(3 * batch_count); + auto A_slices = buffer.data(); auto B_slices = A_slices + batch_count; auto C_slices = B_slices + batch_count; // collect A, B and C slices @@ -488,19 +488,19 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cu const float **dev_A_slices = 0, **dev_B_slices = 0; float **dev_C_slices = 0; - cudaMalloc((void**)&dev_A_slices, batch_count * sizeof(float*)); - cudaMalloc((void**)&dev_B_slices, batch_count * sizeof(float*)); - cudaMalloc((void**)&dev_C_slices, batch_count * sizeof(float*)); - cudaMemcpy(dev_A_slices, A_slices, batch_count * sizeof(float*), cudaMemcpyHostToDevice); - cudaMemcpy(dev_B_slices, B_slices, batch_count * sizeof(float*), cudaMemcpyHostToDevice); - cudaMemcpy(dev_C_slices, C_slices, batch_count * sizeof(float*), cudaMemcpyHostToDevice); + CUDA4DNN_CHECK_CUDA(cudaMalloc((void**)&dev_A_slices, batch_count * sizeof(float*))); + CUDA4DNN_CHECK_CUDA(cudaMalloc((void**)&dev_B_slices, batch_count * sizeof(float*))); + CUDA4DNN_CHECK_CUDA(cudaMalloc((void**)&dev_C_slices, batch_count * sizeof(float*))); + CUDA4DNN_CHECK_CUDA(cudaMemcpy(dev_A_slices, A_slices, batch_count * sizeof(float*), cudaMemcpyHostToDevice)); + CUDA4DNN_CHECK_CUDA(cudaMemcpy(dev_B_slices, B_slices, batch_count * sizeof(float*), cudaMemcpyHostToDevice)); + CUDA4DNN_CHECK_CUDA(cudaMemcpy(dev_C_slices, C_slices, batch_count * sizeof(float*), cudaMemcpyHostToDevice)); // cuBLAS is 
column-major CUDA4DNN_CHECK_CUBLAS(cublasSgemmBatched(handle.get(), opa, opb, iM, iN, iK, &alpha, dev_A_slices, ilda, dev_B_slices, ildb, &beta, dev_C_slices, ildc, batch_count)); - cudaFree(dev_A_slices); - cudaFree(dev_B_slices); - cudaFree(dev_C_slices); + CUDA4DNN_CHECK_CUDA(cudaFree(dev_A_slices)); + CUDA4DNN_CHECK_CUDA(cudaFree(dev_B_slices)); + CUDA4DNN_CHECK_CUDA(cudaFree(dev_C_slices)); } }}}}} /* namespace cv::dnn::cuda4dnn::csl::cublas */ diff --git a/modules/dnn/test/test_onnx_importer.cpp b/modules/dnn/test/test_onnx_importer.cpp index e58d83cdbd..f8187e43fb 100644 --- a/modules/dnn/test/test_onnx_importer.cpp +++ b/modules/dnn/test/test_onnx_importer.cpp @@ -1002,6 +1002,10 @@ TEST_P(Test_ONNX_layers, MatMul_init_bcast) testONNXModels("matmul_init_bcast"); } +TEST_P(Test_ONNX_layers, MatMul_bcast_3dx2d) { + testONNXModels("matmul_bcast"); +} + TEST_P(Test_ONNX_layers, MatMulAdd) { #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2022010000)