mirror of
https://github.com/opencv/opencv.git
synced 2024-11-24 03:00:14 +08:00
Merge pull request #26029 from fengyuentau:dnn/fix_cuda_matmul_crash
This commit is contained in:
commit
78630ddc67
@ -425,8 +425,8 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cu
|
|||||||
|
|
||||||
const auto batch_count = static_cast<int>(batchCount);
|
const auto batch_count = static_cast<int>(batchCount);
|
||||||
|
|
||||||
AutoBuffer<half> buffer(3 * batch_count);
|
AutoBuffer<half*> buffer(3 * batch_count);
|
||||||
auto A_slices = (half**)(buffer.data());
|
auto A_slices = buffer.data();
|
||||||
auto B_slices = A_slices + batch_count;
|
auto B_slices = A_slices + batch_count;
|
||||||
auto C_slices = B_slices + batch_count;
|
auto C_slices = B_slices + batch_count;
|
||||||
// collect A, B and C slices
|
// collect A, B and C slices
|
||||||
@ -438,18 +438,18 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cu
|
|||||||
|
|
||||||
const half **dev_A_slices = 0, **dev_B_slices = 0;
|
const half **dev_A_slices = 0, **dev_B_slices = 0;
|
||||||
half **dev_C_slices = 0;
|
half **dev_C_slices = 0;
|
||||||
cudaMalloc((void**)&dev_A_slices, batch_count * sizeof(half*));
|
CUDA4DNN_CHECK_CUDA(cudaMalloc((void**)&dev_A_slices, batch_count * sizeof(half*)));
|
||||||
cudaMalloc((void**)&dev_B_slices, batch_count * sizeof(half*));
|
CUDA4DNN_CHECK_CUDA(cudaMalloc((void**)&dev_B_slices, batch_count * sizeof(half*)));
|
||||||
cudaMalloc((void**)&dev_C_slices, batch_count * sizeof(half*));
|
CUDA4DNN_CHECK_CUDA(cudaMalloc((void**)&dev_C_slices, batch_count * sizeof(half*)));
|
||||||
cudaMemcpy(dev_A_slices, A_slices, batch_count * sizeof(half*), cudaMemcpyHostToDevice);
|
CUDA4DNN_CHECK_CUDA(cudaMemcpy(dev_A_slices, A_slices, batch_count * sizeof(half*), cudaMemcpyHostToDevice));
|
||||||
cudaMemcpy(dev_B_slices, B_slices, batch_count * sizeof(half*), cudaMemcpyHostToDevice);
|
CUDA4DNN_CHECK_CUDA(cudaMemcpy(dev_B_slices, B_slices, batch_count * sizeof(half*), cudaMemcpyHostToDevice));
|
||||||
cudaMemcpy(dev_C_slices, C_slices, batch_count * sizeof(half*), cudaMemcpyHostToDevice);
|
CUDA4DNN_CHECK_CUDA(cudaMemcpy(dev_C_slices, C_slices, batch_count * sizeof(half*), cudaMemcpyHostToDevice));
|
||||||
|
|
||||||
CUDA4DNN_CHECK_CUBLAS(cublasHgemmBatched(handle.get(), opa, opb, iM, iN, iK, &alpha, dev_A_slices, ilda, dev_B_slices, ildb, &beta, dev_C_slices, ildc, batch_count));
|
CUDA4DNN_CHECK_CUBLAS(cublasHgemmBatched(handle.get(), opa, opb, iM, iN, iK, &alpha, dev_A_slices, ilda, dev_B_slices, ildb, &beta, dev_C_slices, ildc, batch_count));
|
||||||
|
|
||||||
cudaFree(dev_A_slices);
|
CUDA4DNN_CHECK_CUDA(cudaFree(dev_A_slices));
|
||||||
cudaFree(dev_B_slices);
|
CUDA4DNN_CHECK_CUDA(cudaFree(dev_B_slices));
|
||||||
cudaFree(dev_C_slices);
|
CUDA4DNN_CHECK_CUDA(cudaFree(dev_C_slices));
|
||||||
}
|
}
|
||||||
|
|
||||||
template <> inline
|
template <> inline
|
||||||
@ -475,8 +475,8 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cu
|
|||||||
|
|
||||||
const auto batch_count = static_cast<int>(batchCount);
|
const auto batch_count = static_cast<int>(batchCount);
|
||||||
|
|
||||||
AutoBuffer<float> buffer(3 * batch_count);
|
AutoBuffer<float*> buffer(3 * batch_count);
|
||||||
auto A_slices = (float**)(buffer.data());
|
auto A_slices = buffer.data();
|
||||||
auto B_slices = A_slices + batch_count;
|
auto B_slices = A_slices + batch_count;
|
||||||
auto C_slices = B_slices + batch_count;
|
auto C_slices = B_slices + batch_count;
|
||||||
// collect A, B and C slices
|
// collect A, B and C slices
|
||||||
@ -488,19 +488,19 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cu
|
|||||||
|
|
||||||
const float **dev_A_slices = 0, **dev_B_slices = 0;
|
const float **dev_A_slices = 0, **dev_B_slices = 0;
|
||||||
float **dev_C_slices = 0;
|
float **dev_C_slices = 0;
|
||||||
cudaMalloc((void**)&dev_A_slices, batch_count * sizeof(float*));
|
CUDA4DNN_CHECK_CUDA(cudaMalloc((void**)&dev_A_slices, batch_count * sizeof(float*)));
|
||||||
cudaMalloc((void**)&dev_B_slices, batch_count * sizeof(float*));
|
CUDA4DNN_CHECK_CUDA(cudaMalloc((void**)&dev_B_slices, batch_count * sizeof(float*)));
|
||||||
cudaMalloc((void**)&dev_C_slices, batch_count * sizeof(float*));
|
CUDA4DNN_CHECK_CUDA(cudaMalloc((void**)&dev_C_slices, batch_count * sizeof(float*)));
|
||||||
cudaMemcpy(dev_A_slices, A_slices, batch_count * sizeof(float*), cudaMemcpyHostToDevice);
|
CUDA4DNN_CHECK_CUDA(cudaMemcpy(dev_A_slices, A_slices, batch_count * sizeof(float*), cudaMemcpyHostToDevice));
|
||||||
cudaMemcpy(dev_B_slices, B_slices, batch_count * sizeof(float*), cudaMemcpyHostToDevice);
|
CUDA4DNN_CHECK_CUDA(cudaMemcpy(dev_B_slices, B_slices, batch_count * sizeof(float*), cudaMemcpyHostToDevice));
|
||||||
cudaMemcpy(dev_C_slices, C_slices, batch_count * sizeof(float*), cudaMemcpyHostToDevice);
|
CUDA4DNN_CHECK_CUDA(cudaMemcpy(dev_C_slices, C_slices, batch_count * sizeof(float*), cudaMemcpyHostToDevice));
|
||||||
|
|
||||||
// cuBLAS is column-major
|
// cuBLAS is column-major
|
||||||
CUDA4DNN_CHECK_CUBLAS(cublasSgemmBatched(handle.get(), opa, opb, iM, iN, iK, &alpha, dev_A_slices, ilda, dev_B_slices, ildb, &beta, dev_C_slices, ildc, batch_count));
|
CUDA4DNN_CHECK_CUBLAS(cublasSgemmBatched(handle.get(), opa, opb, iM, iN, iK, &alpha, dev_A_slices, ilda, dev_B_slices, ildb, &beta, dev_C_slices, ildc, batch_count));
|
||||||
|
|
||||||
cudaFree(dev_A_slices);
|
CUDA4DNN_CHECK_CUDA(cudaFree(dev_A_slices));
|
||||||
cudaFree(dev_B_slices);
|
CUDA4DNN_CHECK_CUDA(cudaFree(dev_B_slices));
|
||||||
cudaFree(dev_C_slices);
|
CUDA4DNN_CHECK_CUDA(cudaFree(dev_C_slices));
|
||||||
}
|
}
|
||||||
|
|
||||||
}}}}} /* namespace cv::dnn::cuda4dnn::csl::cublas */
|
}}}}} /* namespace cv::dnn::cuda4dnn::csl::cublas */
|
||||||
|
@ -1002,6 +1002,10 @@ TEST_P(Test_ONNX_layers, MatMul_init_bcast)
|
|||||||
testONNXModels("matmul_init_bcast");
|
testONNXModels("matmul_init_bcast");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TEST_P(Test_ONNX_layers, MatMul_bcast_3dx2d) {
|
||||||
|
testONNXModels("matmul_bcast");
|
||||||
|
}
|
||||||
|
|
||||||
TEST_P(Test_ONNX_layers, MatMulAdd)
|
TEST_P(Test_ONNX_layers, MatMulAdd)
|
||||||
{
|
{
|
||||||
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2022010000)
|
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2022010000)
|
||||||
|
Loading…
Reference in New Issue
Block a user